htsjdk.samtools.reference.ReferenceSequenceFileFactory Java Examples

The following examples show how to use htsjdk.samtools.reference.ReferenceSequenceFileFactory. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: RefRepo.java    From cramtools with Apache License 2.0 6 votes vote down vote up
@Override
public List<Entry> call() throws Exception {
	List<Entry> list = new ArrayList<RefRepo.Entry>();

	ReferenceSequenceFile rsFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(file);

	ReferenceSequence sequence = null;
	while ((sequence = rsFile.nextSequence()) != null) {
		sequence.getBases();

		Entry e = new Entry();
		e.md5 = Utils.calculateMD5String(sequence.getBases());
		e.file = "file://" + file.getAbsolutePath();
		e.name = sequence.getName();
		e.length = sequence.length();
		log.info(String.format("New entry: %s", e.toString()));
		list.add(e);
	}
	return list;
}
 
Example #2
Source File: MaskReferenceSequence.java    From Drop-seq with MIT License 6 votes vote down vote up
@Override
protected int doWork() {
	IOUtil.assertFileIsReadable(this.REFERENCE_SEQUENCE);
	IOUtil.assertFileIsWritable(this.OUTPUT);
	// validate that an index is present for the reference sequence, since it's required.
	final ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE_SEQUENCE, true, true);
	if (!ref.isIndexed())
		throw new IllegalStateException ("Input fasta must be indexed.  You can do this by using samtools faidx to create an index");

	FastaSequenceFileWriter writer = new FastaSequenceFileWriter(OUTPUT, OUTPUT_LINE_LENGTH);
	if (this.CONTIG_PATTERN_TO_IGNORE!=null && !this.CONTIG_PATTERN_TO_IGNORE.isEmpty()) processByWholeContig(ref, writer, this.CONTIG_PATTERN_TO_IGNORE);
	if (this.INTERVALS!=null) processByPartialContig(ref, writer, this.INTERVALS);

	CloserUtil.close(ref);
	CloserUtil.close(writer);
	return 0;

}
 
Example #3
Source File: GtcToVcf.java    From picard with MIT License 6 votes vote down vote up
@Override
protected String[] customCommandLineValidation() {

    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsReadable(EXTENDED_ILLUMINA_MANIFEST);
    IOUtil.assertFileIsReadable(ILLUMINA_BEAD_POOL_MANIFEST_FILE);
    IOUtil.assertFileIsWritable(OUTPUT);
    refSeq = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE_SEQUENCE);
    final SAMSequenceDictionary sequenceDictionary = refSeq.getSequenceDictionary();
    final String assembly = sequenceDictionary.getSequence(0).getAssembly();
    if (!assembly.equals("GRCh37")) {
        return new String[]{"The selected reference sequence ('" + assembly + "') is not supported.  This tool is currently only implemented to support NCBI Build 37 / HG19 Reference Sequence."};
    }

    if (FINGERPRINT_GENOTYPES_VCF_FILE != null) {
        IOUtil.assertFileIsReadable(FINGERPRINT_GENOTYPES_VCF_FILE);
    }
    if (GENDER_GTC != null) {
        IOUtil.assertFileIsReadable(GENDER_GTC);
    }

    return super.customCommandLineValidation();
}
 
Example #4
Source File: GcBiasUtils.java    From picard with MIT License 6 votes vote down vote up
public static int[] calculateRefWindowsByGc(final int windows, final File referenceSequence, final int windowSize) {
    final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceSequence);
    ReferenceSequence ref;

    final int [] windowsByGc = new int [windows];

    while ((ref = refFile.nextSequence()) != null) {
        final byte[] refBases = ref.getBases();
        StringUtil.toUpperCase(refBases);
        final int refLength = refBases.length;
        final int lastWindowStart = refLength - windowSize;

        final CalculateGcState state = new GcBiasUtils().new CalculateGcState();

        for (int i = 1; i < lastWindowStart; ++i) {
            final int windowEnd = i + windowSize;
            final int gcBin = calculateGc(refBases, i, windowEnd, state);
            if (gcBin != -1) windowsByGc[gcBin]++;
        }
    }

    return windowsByGc;
}
 
Example #5
Source File: GatkToolIntegrationTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
@Test (expectedExceptions = java.lang.IllegalArgumentException.class)
// test asserting that if the reference dictionary exists but is not valid we get a more helpful exception than a null pointer exception
public void testBrokenReferenceDictionaryErrorMessage() throws IOException {
    File out = createTempFile("GTStrippedOutput", "vcf");

    Path refCopy = Files.copy(IOUtils.getPath(hg19_chr1_1M_Reference), createTempPath("reference", ".fasta"), StandardCopyOption.REPLACE_EXISTING);
    Path indexCopy = Files.copy(ReferenceSequenceFileFactory.getFastaIndexFileName(IOUtils.getPath(hg19_chr1_1M_Reference)), ReferenceSequenceFileFactory.getFastaIndexFileName(refCopy));
    File emptyDict = new File(ReferenceSequenceFileFactory.getDefaultDictionaryForReferenceSequence(refCopy).toString());
    IOUtils.deleteOnExit(indexCopy);

    emptyDict.createNewFile();
    IOUtils.deleteOnExit(emptyDict.toPath());

    String[] args = new String[] {
            "-R", refCopy.toString(),
            "-I", TEST_DIRECTORY + "CEUTrio.HiSeq.WGS.b37.NA12878.20.21.10000000-10000020.with.unmapped.bam",
            "-O", out.getAbsolutePath()
    };

    runCommandLine(Arrays.asList(args), Mutect2.class.getSimpleName());
}
 
Example #6
Source File: GATKSparkTool.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Register the reference file (and associated dictionary and index) to be downloaded to every node using Spark's
 * copying mechanism ({@code SparkContext#addFile()}).
 * @param ctx the Spark context
 * @param referencePath the reference file, can be a local file or a remote path
 * @return the reference file name; the absolute path of the file can be found by a Spark task using {@code SparkFiles#get()}
 */
protected static String addReferenceFilesForSpark(JavaSparkContext ctx, Path referencePath) {
    if (referencePath == null) {
        return null;
    }
    Path indexPath = ReferenceSequenceFileFactory.getFastaIndexFileName(referencePath);
    Path dictPath = ReferenceSequenceFileFactory.getDefaultDictionaryForReferenceSequence(referencePath);
    Path gziPath = GZIIndex.resolveIndexNameForBgzipFile(referencePath);

    ctx.addFile(referencePath.toUri().toString());
    if (Files.exists(indexPath)) {
        ctx.addFile(indexPath.toUri().toString());
    }
    if (Files.exists(dictPath)) {
        ctx.addFile(dictPath.toUri().toString());
    }
    if (Files.exists(gziPath)) {
        ctx.addFile(gziPath.toUri().toString());
    }

    return referencePath.getFileName().toString();
}
 
Example #7
Source File: ReferenceFileSparkSource.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Override
public ReferenceBases getReferenceBases(final SimpleInterval interval) throws IOException {
    try ( ReferenceSequenceFile referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(getReferencePath()) ) {
        ReferenceSequence sequence = referenceSequenceFile.getSubsequenceAt(interval.getContig(), interval.getStart(), interval.getEnd());
        return new ReferenceBases(sequence.getBases(), interval);
    }
}
 
Example #8
Source File: Utils.java    From cramtools with Apache License 2.0 5 votes vote down vote up
public static byte[] getBasesFromReferenceFile(String referenceFilePath, String seqName, int from, int length) {
	ReferenceSequenceFile referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(
			referenceFilePath));
	ReferenceSequence sequence = referenceSequenceFile.getSequence(seqName);
	byte[] bases = referenceSequenceFile.getSubsequenceAt(sequence.getName(), from, from + length).getBases();
	return bases;
}
 
Example #9
Source File: ReferenceSource.java    From cramtools with Apache License 2.0 5 votes vote down vote up
public ReferenceSource(File file) {
	if (file != null) {
		rsFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(file);

		File indexFile = new File(file.getAbsoluteFile() + ".fai");
		if (indexFile.exists())
			fastaSequenceIndex = new FastaSequenceIndex(indexFile);
	}
}
 
Example #10
Source File: CreateSequenceDictionary.java    From varsim with BSD 2-Clause "Simplified" License 5 votes vote down vote up
public CreateSequenceDictionary(String FastaSequenceFileName) {
  try {
    md5 = MessageDigest.getInstance("MD5");
  } catch (NoSuchAlgorithmException e) {
    throw new RuntimeException("MD5 algorithm not found", e);
  }
  REFERENCE_SEQUENCE = new File(FastaSequenceFileName);
  URI = "file:" + REFERENCE_SEQUENCE.getAbsolutePath();
  OUTPUT = ReferenceSequenceFileFactory.getDefaultDictionaryForReferenceSequence(REFERENCE_SEQUENCE);
  logger.info("Output dictionary will be written in ", OUTPUT);
  doWork();
}
 
Example #11
Source File: CachingIndexedFastaSequenceFileUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Test(dataProvider = "fastas")
public void testCachingIndexedFastaReaderTwoStage(Path fasta, Path unzipped, int cacheSize, int querySize) throws IOException {
    try(final ReferenceSequenceFile uncached = ReferenceSequenceFileFactory.getReferenceSequenceFile(unzipped);
        final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true, false)) {

        SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0);

        int middleStart = (contig.getSequenceLength() - querySize) / 2;
        int middleStop = middleStart + querySize;

        logger.debug(String.format(
                "Checking contig %s length %d with cache size %d and query size %d with intermediate query",
                contig.getSequenceName(), contig.getSequenceLength(), cacheSize, querySize));

        for (int i = 0; i < contig.getSequenceLength(); i += 10) {
            int start = i;
            int stop = start + querySize;
            if (stop <= contig.getSequenceLength()) {
                ReferenceSequence grabMiddle = caching.getSubsequenceAt(contig.getSequenceName(), middleStart,
                                                                        middleStop);
                ReferenceSequence cachedVal = caching.getSubsequenceAt(contig.getSequenceName(), start, stop);
                ReferenceSequence uncachedVal = uncached.getSubsequenceAt(contig.getSequenceName(), start, stop);

                Assert.assertEquals(cachedVal.getName(), uncachedVal.getName());
                Assert.assertEquals(cachedVal.getContigIndex(), uncachedVal.getContigIndex());
                Assert.assertEquals(cachedVal.getBases(), uncachedVal.getBases());
            }
        }
    }
}
 
Example #12
Source File: CachingIndexedFastaSequenceFileUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
private void testSequential(final CachingIndexedFastaSequenceFile caching, final Path unzipped, final int querySize) throws IOException {
    Assert.assertTrue(caching.isPreservingCase(), "testSequential only works for case preserving CachingIndexedFastaSequenceFile readers");

    //use an unzipped version of the fasta because it's slow to repeatedly query the zipped version without caching
    try( final ReferenceSequenceFile uncached = ReferenceSequenceFileFactory.getReferenceSequenceFile(unzipped)) {

        SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0);
        for (int i = 0; i < contig.getSequenceLength(); i += STEP_SIZE) {
            int start = i;
            int stop = start + querySize;
            if (stop <= contig.getSequenceLength()) {
                ReferenceSequence cachedVal = caching.getSubsequenceAt(contig.getSequenceName(), start, stop);
                ReferenceSequence uncachedVal = uncached.getSubsequenceAt(contig.getSequenceName(), start, stop);

                Assert.assertEquals(cachedVal.getName(), uncachedVal.getName());
                Assert.assertEquals(cachedVal.getContigIndex(), uncachedVal.getContigIndex());
                Assert.assertEquals(cachedVal.getBases(), uncachedVal.getBases());
            }
        }

        // asserts for efficiency.  We are going to make contig.length / STEP_SIZE queries
        // at each of range: start -> start + querySize against a cache with size of X.
        // we expect to hit the cache each time range falls within X.  We expect a hit
        // on the cache if range is within X.  Which should happen at least (X - query_size * 2) / STEP_SIZE
        // times.
        final int minExpectedHits = (int) Math.floor(
                (Math.min(caching.getCacheSize(), contig.getSequenceLength()) - querySize * 2.0) / STEP_SIZE);
        caching.printEfficiency(Level.DEBUG);
        Assert.assertTrue(caching.getCacheHits() >= minExpectedHits,
                          "Expected at least " + minExpectedHits + " cache hits but only got " + caching.getCacheHits());
    }

}
 
Example #13
Source File: CachingIndexedFastaSequenceFile.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Performing several preliminary checks on the file path.
 *
 * @param fastaPath Fasta file to be used as reference
 * @throws UserException If the given {@code fastaPath} is not good.
 */
private static void checkFastaPath(final Path fastaPath) {

    // does the fasta file exist? check that first...
    if (!Files.exists(fastaPath)) {
        throw new UserException.MissingReference("The specified fasta file (" + fastaPath.toUri() + ") does not exist.");
    }

    //this is the .fai index, not the .gzi
    final Path indexPath = ReferenceSequenceFileFactory.getFastaIndexFileName(fastaPath);

    // determine the name for the dict file
    final Path dictPath = ReferenceSequenceFileFactory.getDefaultDictionaryForReferenceSequence(fastaPath);

    // It's an error if either the fai or dict file does not exist. The user is now responsible
    // for creating these files.
    if (!Files.exists(indexPath)) {
        throw new UserException.MissingReferenceFaiFile(indexPath, fastaPath);
    }
    if (!Files.exists(dictPath)) {
        throw new UserException.MissingReferenceDictFile(dictPath, fastaPath);
    }

    //Block-compressed fastas additionally require a gzi index
    try {
        final Path gziPath = GZIIndex.resolveIndexNameForBgzipFile(fastaPath);
        if( IOUtil.isBlockCompressed(fastaPath, true) && !Files.exists(gziPath)){
            throw new UserException.MissingReferenceGziFile(gziPath, fastaPath);
        }
    } catch (IOException e) {
        throw new UserException.CouldNotReadInputFile("Couldn't open fasta file: " + fastaPath.toUri().toString() +  ".", e);
    }
}
 
Example #14
Source File: ReferenceFileSparkSource.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
public Map<String, ReferenceBases> getAllReferenceBases() throws IOException {
    try ( ReferenceSequenceFile referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(getReferencePath()) ) {
        Map<String, ReferenceBases> bases = new LinkedHashMap<>();
        ReferenceSequence seq;
        while ( (seq = referenceSequenceFile.nextSequence()) != null ) {
            String name = seq.getName();
            bases.put(name, new ReferenceBases(seq.getBases(), new SimpleInterval(name, 1, seq.length())));
        }
        return bases;
    }
}
 
Example #15
Source File: ValidateReference.java    From Drop-seq with MIT License 5 votes vote down vote up
private SAMSequenceDictionary makeSequenceDictionary(final File referenceFile) {
    final ReferenceSequenceFile refSeqFile =
            ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceFile, true);
    ReferenceSequence refSeq;
    final List<SAMSequenceRecord> ret = new ArrayList<>();
    final Set<String> sequenceNames = new HashSet<>();
    while ((refSeq = refSeqFile.nextSequence()) != null) {
        if (sequenceNames.contains(refSeq.getName())) {
            throw new PicardException("Sequence name appears more than once in reference: " + refSeq.getName());
        }
        sequenceNames.add(refSeq.getName());
        ret.add(new SAMSequenceRecord(refSeq.getName(), refSeq.length()));
    }
    return new SAMSequenceDictionary(ret);
}
 
Example #16
Source File: ExtractSequences.java    From picard with MIT License 5 votes vote down vote up
@Override
protected int doWork() {
    IOUtil.assertFileIsReadable(INTERVAL_LIST);
    IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
    IOUtil.assertFileIsWritable(OUTPUT);

    final IntervalList intervals = IntervalList.fromFile(INTERVAL_LIST);
    final ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE_SEQUENCE);
    SequenceUtil.assertSequenceDictionariesEqual(intervals.getHeader().getSequenceDictionary(), ref.getSequenceDictionary());

    final BufferedWriter out = IOUtil.openFileForBufferedWriting(OUTPUT);

    for (final Interval interval : intervals) {
        final ReferenceSequence seq = ref.getSubsequenceAt(interval.getContig(), interval.getStart(), interval.getEnd());
        final byte[] bases = seq.getBases();
        if (interval.isNegativeStrand()) SequenceUtil.reverseComplement(bases);

        try {
            out.write(">");
            out.write(interval.getName());
            out.write("\n");

            for (int i=0; i<bases.length; ++i) {
                if (i > 0 && i % LINE_LENGTH == 0) out.write("\n");
                out.write(bases[i]);
            }

            out.write("\n");
        }
        catch (IOException ioe) {
            throw new PicardException("Error writing to file " + OUTPUT.getAbsolutePath(), ioe);

        }
    }

    CloserUtil.close(out);

    return 0;
}
 
Example #17
Source File: ScatterIntervalsByNs.java    From picard with MIT License 5 votes vote down vote up
@Override
protected int doWork() {
    IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
    IOUtil.assertFileIsWritable(OUTPUT);

    final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE_SEQUENCE, true);
    if (!refFile.isIndexed()) {
        throw new IllegalStateException("Reference file must be indexed, but no index file was found");
    }
    if (refFile.getSequenceDictionary() == null) {
        throw new IllegalStateException("Reference file must include a dictionary, but no dictionary file was found");
    }

    // get the intervals
    final IntervalList intervals = segregateReference(refFile, MAX_TO_MERGE);

    log.info(String.format("Found %d intervals in %d loci during %s seconds", intervalProgress.getCount(), locusProgress.getCount(), locusProgress.getElapsedSeconds()));

    /**********************************
     * Now output regions for calling *
     **********************************/

    final IntervalList outputIntervals = new IntervalList(intervals.getHeader().clone());
    log.info(String.format("Collecting requested type of intervals (%s)", OUTPUT_TYPE));

    intervals.getIntervals().stream().filter(i -> OUTPUT_TYPE.accepts(i.getName())).forEach(outputIntervals::add);

    log.info("Writing Intervals.");
    outputIntervals.write(OUTPUT);

    log.info(String.format("Execution ending. Total time %d seconds", locusProgress.getElapsedSeconds()));

    return 0;
}
 
Example #18
Source File: SequenceDictionaryUtils.java    From picard with MIT License 5 votes vote down vote up
public SamSequenceRecordsIterator(File referenceSequence, boolean truncateNamesAtWhitespace) {
    this.truncateNamesAtWhitespace = truncateNamesAtWhitespace;
    this.refSeqFile = ReferenceSequenceFileFactory.
            getReferenceSequenceFile(referenceSequence, truncateNamesAtWhitespace);

    this.nextRefSeq = refSeqFile.nextSequence();
    try {
        md5 = MessageDigest.getInstance("MD5");
    } catch (NoSuchAlgorithmException e) {
        throw new PicardException("MD5 algorithm not found", e);
    }
}
 
Example #19
Source File: FingerprintUtils.java    From picard with MIT License 5 votes vote down vote up
/**
 * A function that takes a Fingerprint and writes it as a VCF to a file
 *
 * @param fingerprint               the fingerprint to write
 * @param outputFile                the file to write to
 * @param referenceSequenceFileName the reference sequence (file)
 * @param sample                    the sample name to use in the vcf
 * @param source                    a "source" comment to use in the VCF
 * @throws IOException
 */
public static void writeFingerPrint(final Fingerprint fingerprint,
                                    final File outputFile,
                                    final File referenceSequenceFileName,
                                    final String sample,
                                    final String source) throws IOException {

    try (final ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceSequenceFileName);
         final VariantContextWriter variantContextWriter = getVariantContextWriter(outputFile, referenceSequenceFileName, sample, source, ref)) {

        createVCSetFromFingerprint(fingerprint, ref, sample).forEach(variantContextWriter::add);
    }
}
 
Example #20
Source File: CreateSequenceDictionary.java    From picard with MIT License 5 votes vote down vote up
/**
 * Use reference filename to create URI to go into header if URI was not passed on cmd line.
 */
protected String[] customCommandLineValidation() {
    if (URI == null) {
        URI = "file:" + referenceSequence.getReferenceFile().getAbsolutePath();
    }
    if (OUTPUT == null) {
        OUTPUT = ReferenceSequenceFileFactory.getDefaultDictionaryForReferenceSequence(referenceSequence.getReferenceFile());
        logger.info("Output dictionary will be written in ", OUTPUT);
    }
    return super.customCommandLineValidation();
}
 
Example #21
Source File: ValidateReference.java    From Drop-seq with MIT License 5 votes vote down vote up
private void validateReferenceBases(File referenceFile) {
    final ReferenceSequenceFile refSeqFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceFile, true);
    ReferenceSequence sequence;
    while ((sequence = refSeqFile.nextSequence()) != null) {
        for (final byte base: sequence.getBases()) {
            if (!IUPAC_TABLE[base]) {
                messages.baseErrors = String.format("WARNING: AT least one invalid base '%c' (decimal %d) in reference sequence named %s",
                        StringUtil.byteToChar(base), base, sequence.getName());
                break;
            }
        }
    }
}
 
Example #22
Source File: NonNFastaSize.java    From picard with MIT License 4 votes vote down vote up
@Override
protected int doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);

    // set up the reference and a mask so that we only count the positions requested by the user
    final ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(INPUT);
    final ReferenceSequenceMask referenceSequenceMask;
    if (INTERVALS != null) {
        IOUtil.assertFileIsReadable(INTERVALS);
        final IntervalList intervalList = IntervalList.fromFile(INTERVALS);
        referenceSequenceMask = new IntervalListReferenceSequenceMask(intervalList);
    } else {
        final SAMFileHeader header = new SAMFileHeader();
        header.setSequenceDictionary(ref.getSequenceDictionary());
        referenceSequenceMask = new WholeGenomeReferenceSequenceMask(header);
    }

    long nonNbases = 0L;

    for (final SAMSequenceRecord rec : ref.getSequenceDictionary().getSequences()) {
        // pull out the contig and set up the bases
        final ReferenceSequence sequence = ref.getSequence(rec.getSequenceName());
        final byte[] bases = sequence.getBases();
        StringUtil.toUpperCase(bases);

        for (int i = 0; i < bases.length; i++) {
            // only investigate this position if it's within our mask
            if (referenceSequenceMask.get(sequence.getContigIndex(), i+1)) {
                nonNbases += bases[i] == SequenceUtil.N ? 0 : 1;
            }
        }
    }

    try {
        final BufferedWriter out = IOUtil.openFileForBufferedWriting(OUTPUT);
        out.write(nonNbases + "\n");
        out.close();
    }
    catch (IOException ioe) {
        throw new PicardException("Error writing to file " + OUTPUT.getAbsolutePath(), ioe);
    }

    return 0;
}
 
Example #23
Source File: ReferenceFileSparkSource.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
@Override
public SAMSequenceDictionary getReferenceSequenceDictionary(final SAMSequenceDictionary optReadSequenceDictionaryToMatch) throws IOException {
    try ( ReferenceSequenceFile referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(getReferencePath()) ) {
        return referenceSequenceFile.getSequenceDictionary();
    }
}
 
Example #24
Source File: ReferenceHadoopSparkSource.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
@Override
public ReferenceBases getReferenceBases(final SimpleInterval interval) {
    ReferenceSequenceFile referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new GATKPath(referenceURI.toString()).toPath());
    ReferenceSequence sequence = referenceSequenceFile.getSubsequenceAt(interval.getContig(), interval.getStart(), interval.getEnd());
    return new ReferenceBases(sequence.getBases(), interval);
}
 
Example #25
Source File: ReferenceHadoopSparkSource.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
@Override
public SAMSequenceDictionary getReferenceSequenceDictionary(final SAMSequenceDictionary optReadSequenceDictionaryToMatch) {
    ReferenceSequenceFile referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new GATKPath(referenceURI.toString()).toPath());
    return referenceSequenceFile.getSequenceDictionary();
}
 
Example #26
Source File: NormalizeFasta.java    From picard with MIT License 4 votes vote down vote up
@Override
protected int doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);

    if (INPUT.getAbsoluteFile().equals(OUTPUT.getAbsoluteFile())) {
        throw new IllegalArgumentException("Input and output cannot be the same file.");
    }

    final ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(INPUT, TRUNCATE_SEQUENCE_NAMES_AT_WHITESPACE);
    final BufferedWriter out = IOUtil.openFileForBufferedWriting(OUTPUT);

    ReferenceSequence seq = null;
    while ((seq = ref.nextSequence()) != null) {
        final String name  = seq.getName();
        final byte[] bases = seq.getBases();

        try {
            out.write(">");
            out.write(name);
            out.newLine();

            if (bases.length == 0) {
                log.warn("Sequence " + name + " contains 0 bases.");
            }
            else {
                for (int i=0; i<bases.length; ++i) {
                    if (i > 0 && i % LINE_LENGTH == 0) out.write("\n");
                    out.write(bases[i]);
                }

                out.write("\n");
            }
        }
        catch (IOException ioe) {
            throw new PicardException("Error writing to file " + OUTPUT.getAbsolutePath(), ioe);

        }
    }
    try {
        out.close();
    } catch (IOException e) {
        throw new RuntimeIOException(e);
    }
    return 0;
}
 
Example #27
Source File: CollectTargetedMetrics.java    From picard with MIT License 4 votes vote down vote up
/**
 * Asserts that files are readable and writable and then fires off an
 * HsMetricsCalculator instance to do the real work.
 */
protected int doWork() {
    for (final File targetInterval : TARGET_INTERVALS) IOUtil.assertFileIsReadable(targetInterval);
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);
    if (PER_TARGET_COVERAGE != null) IOUtil.assertFileIsWritable(PER_TARGET_COVERAGE);

    final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);
    final IntervalList targetIntervals = IntervalList.fromFiles(TARGET_INTERVALS);

    // Validate that the targets and baits have the same references as the reads file
    SequenceUtil.assertSequenceDictionariesEqual(
            reader.getFileHeader().getSequenceDictionary(),
            targetIntervals.getHeader().getSequenceDictionary());
    SequenceUtil.assertSequenceDictionariesEqual(
            reader.getFileHeader().getSequenceDictionary(),
            getProbeIntervals().getHeader().getSequenceDictionary()
    );

    ReferenceSequenceFile ref = null;
    if (REFERENCE_SEQUENCE != null) {
        IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
        ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE_SEQUENCE);
        SequenceUtil.assertSequenceDictionariesEqual(
                reader.getFileHeader().getSequenceDictionary(), ref.getSequenceDictionary(),
                INPUT, REFERENCE_SEQUENCE
        );
    }

    final COLLECTOR collector = makeCollector(
            METRIC_ACCUMULATION_LEVEL,
            reader.getFileHeader().getReadGroups(),
            ref,
            PER_TARGET_COVERAGE,
            PER_BASE_COVERAGE,
            targetIntervals,
            getProbeIntervals(),
            getProbeSetName(),
            NEAR_DISTANCE
    );

    final ProgressLogger progress = new ProgressLogger(log);
    for (final SAMRecord record : reader) {
        collector.acceptRecord(record, null);
        progress.record(record);
    }

    // Write the output file
    final MetricsFile<METRIC, Integer> metrics = getMetricsFile();
    collector.finish();

    collector.addAllLevelsToFile(metrics);

    metrics.write(OUTPUT);

    if (THEORETICAL_SENSITIVITY_OUTPUT != null) {
        // Write out theoretical sensitivity results.
        final MetricsFile<TheoreticalSensitivityMetrics, ?> theoreticalSensitivityMetrics = getMetricsFile();
        log.info("Calculating theoretical sentitivity at " + ALLELE_FRACTION.size() + " allele fractions.");
        List<TheoreticalSensitivityMetrics> tsm = TheoreticalSensitivity.calculateSensitivities(SAMPLE_SIZE, collector.getDepthHistogram(), collector.getBaseQualityHistogram(), ALLELE_FRACTION);
        theoreticalSensitivityMetrics.addAllMetrics(tsm);
        theoreticalSensitivityMetrics.write(THEORETICAL_SENSITIVITY_OUTPUT);
    }

    CloserUtil.close(reader);
    return 0;
}
 
Example #28
Source File: ValidateCramFile.java    From cramtools with Apache License 2.0 4 votes vote down vote up
public static void main(String[] args) throws IOException, IllegalArgumentException, IllegalAccessException {
	Params params = new Params();
	JCommander jc = new JCommander(params);
	try {
		jc.parse(args);
	} catch (Exception e) {
		System.out.println("Failed to parse parameteres, detailed message below: ");
		System.out.println(e.getMessage());
		System.out.println();
		System.out.println("See usage: -h");
		System.exit(1);
	}

	if (args.length == 0 || params.help) {
		printUsage(jc);
		System.exit(1);
	}

	if (params.reference == null) {
		System.out.println("A reference fasta file is required.");
		System.exit(1);
	}

	if (params.cramFile == null) {
		System.out.println("A CRAM input file is required. ");
		System.exit(1);
	}

	Log.setGlobalLogLevel(Log.LogLevel.INFO);

	ReferenceSequenceFile referenceSequenceFile = ReferenceSequenceFileFactory
			.getReferenceSequenceFile(params.reference);

	FileInputStream fis = new FileInputStream(params.cramFile);
	BufferedInputStream bis = new BufferedInputStream(fis);

	CRAMIterator iterator = new CRAMIterator(bis, new ReferenceSource(params.reference),
			ValidationStringency.STRICT);
	CramHeader cramHeader = iterator.getCramHeader();

	iterator.close();

	ProgressLogger progress = new ProgressLogger(log, 100000, "Validated Read");
	SamFileValidator v = new SamFileValidator(new PrintWriter(System.out), 1);
	final SamReader reader = SamReaderFactory.make().referenceSequence(params.reference).open(params.cramFile);
	List<SAMValidationError.Type> errors = new ArrayList<SAMValidationError.Type>();
	errors.add(SAMValidationError.Type.MATE_NOT_FOUND);
	// errors.add(Type.MISSING_TAG_NM);
	v.setErrorsToIgnore(errors);
	v.validateSamFileSummary(reader, ReferenceSequenceFileFactory.getReferenceSequenceFile(params.reference));
	log.info("Elapsed seconds: " + progress.getElapsedSeconds());
}
 
Example #29
Source File: GatherGeneGCLengthTest.java    From Drop-seq with MIT License 4 votes vote down vote up
@Test
public void testBasic() throws IOException {
	final GatherGeneGCLength clp = new GatherGeneGCLength();
       final File outputFile = File.createTempFile("GatherGeneGCLengthTest.", ".gc_length_metrics");
       final File outputTranscriptSequencesFile = File.createTempFile("GatherGeneGCLengthTest.", ".transcript_sequences.fasta");
       final File outputTranscriptLevelFile = File.createTempFile("GatherGeneGCLengthTest.", ".transcript_level");
       outputFile.deleteOnExit();
       outputTranscriptSequencesFile.deleteOnExit();
       outputTranscriptLevelFile.deleteOnExit();
       final String[] args = new String[] {
               "ANNOTATIONS_FILE=" + GTF.getAbsolutePath(),
               "REFERENCE_SEQUENCE=" + FASTA.getAbsolutePath(),
               "OUTPUT=" + outputFile.getAbsolutePath(),
               "OUTPUT_TRANSCRIPT_SEQUENCES=" + outputTranscriptSequencesFile.getAbsolutePath(),
               "OUTPUT_TRANSCRIPT_LEVEL=" + outputTranscriptLevelFile.getAbsolutePath()
       };
       Assert.assertEquals(clp.instanceMain(args), 0);
       final ReferenceSequence transcriptSequence = ReferenceSequenceFileFactory.getReferenceSequenceFile(outputTranscriptSequencesFile, false).nextSequence();
       final ReferenceSequence genomicSequence = ReferenceSequenceFileFactory.getReferenceSequenceFile(FASTA).nextSequence();
       // For ERCC, the first sequence is the first gene, which is a single exon
       final String geneName = "ERCC-00002";
       final String transcriptName = "DQ459430";
       Assert.assertEquals(transcriptSequence.getBaseString(), genomicSequence.getBaseString());
       Assert.assertEquals(transcriptSequence.getName(), geneName + " " + transcriptName);
       int numG = 0;
       int numC = 0;
       for (byte b : transcriptSequence.getBases())
		if (b == SequenceUtil.G) ++numG;
       	else if (b == SequenceUtil.C) ++numC;
	final double pctG = 100 * numG/(double)transcriptSequence.getBases().length;
       final double pctC = 100 * numC/(double)transcriptSequence.getBases().length;
       final double pctGC = 100 * (numC + numG)/(double)transcriptSequence.getBases().length;
       TabbedTextFileWithHeaderParser.Row metric = new TabbedTextFileWithHeaderParser(outputFile).iterator().next();
       Assert.assertEquals(metric.getField("GENE"), geneName);
       Assert.assertEquals(Integer.parseInt(metric.getField("START")), 1);
       Assert.assertEquals(Integer.parseInt(metric.getField("END")), transcriptSequence.getBases().length);
       Assert.assertEquals(Double.parseDouble(metric.getField("PCT_GC_UNIQUE_EXON_BASES")), pctGC, 0.05);
       Assert.assertEquals(Double.parseDouble(metric.getField("PCT_C_ISOFORM_AVERAGE")), pctC, 0.05);
       Assert.assertEquals(Double.parseDouble(metric.getField("PCT_G_ISOFORM_AVERAGE")), pctG, 0.05);
       Assert.assertEquals(Integer.parseInt(metric.getField("NUM_TRANSCRIPTS")), 1);
       TabbedTextFileWithHeaderParser.Row transcriptLevel = new TabbedTextFileWithHeaderParser(outputTranscriptLevelFile).iterator().next();
       Assert.assertEquals(transcriptLevel.getField("TRANSCRIPT"), transcriptName);
       Assert.assertEquals(transcriptLevel.getField("CHR"), genomicSequence.getName());
       Assert.assertEquals(Integer.parseInt(transcriptLevel.getField("START")), 1);
       Assert.assertEquals(Integer.parseInt(transcriptLevel.getField("END")), transcriptSequence.getBases().length);
       Assert.assertEquals(Double.parseDouble(transcriptLevel.getField("PCT_GC")), pctGC, 0.05);
       Assert.assertEquals(Double.parseDouble(transcriptLevel.getField("PCT_C")), pctC, 0.05);
       Assert.assertEquals(Double.parseDouble(transcriptLevel.getField("PCT_G")), pctG, 0.05);
}