htsjdk.samtools.reference.ReferenceSequenceFile Java Examples

The following examples show how to use htsjdk.samtools.reference.ReferenceSequenceFile. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TargetedPcrMetricsCollector.java    From picard with MIT License 6 votes vote down vote up
public TargetedPcrMetricsCollector(final Set<MetricAccumulationLevel> accumulationLevels,
                                   final List<SAMReadGroupRecord> samRgRecords,
                                   final ReferenceSequenceFile refFile,
                                   final File perTargetCoverage,
                                   final File perBaseCoverage,
                                   final IntervalList targetIntervals,
                                   final IntervalList probeIntervals,
                                   final String probeSetName,
                                   final int nearProbeDistance,
                                   final int minimumMappingQuality,
                                   final int minimumBaseQuality,
                                   final boolean clipOverlappingReads,
                                   final boolean noSideEffects,
                                   final boolean includeIndels,
                                   final int coverageCap,
                                   final int sampleSize) {
    super(accumulationLevels, samRgRecords, refFile, perTargetCoverage, perBaseCoverage, targetIntervals, probeIntervals, probeSetName, nearProbeDistance, minimumMappingQuality, minimumBaseQuality, clipOverlappingReads, noSideEffects, includeIndels, coverageCap, sampleSize);
}
 
Example #2
Source File: OverhangFixingManager.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 *
 * @param header                   header for the reads
 * @param writer                   actual writer
 * @param genomeLocParser          the GenomeLocParser object
 * @param referenceReader          the reference reader
 * @param maxRecordsInMemory       max records to keep in memory
 * @param maxMismatchesInOverhangs max number of mismatches permitted in the overhangs before requiring clipping
 * @param maxBasesInOverhangs      max number of bases permitted in the overhangs before deciding not to clip
 * @param doNotFixOverhangs        if true, don't clip overhangs at all
 * @param processSecondaryReads    if true, allow secondary reads to be overhang clipped. If false, secondary reads
 *                                 will still be written out to the writer but will not be edited by the clipper.
 */
public OverhangFixingManager(final SAMFileHeader header,
                             final GATKReadWriter writer,
                             final GenomeLocParser genomeLocParser,
                             final ReferenceSequenceFile referenceReader,
                             final int maxRecordsInMemory,
                             final int maxMismatchesInOverhangs,
                             final int maxBasesInOverhangs,
                             final boolean doNotFixOverhangs,
                             final boolean processSecondaryReads) {
    this.header = header;
    this.writer = writer;
    this.genomeLocParser = genomeLocParser;
    this.referenceReader = referenceReader;
    this.maxRecordsInMemory = maxRecordsInMemory;
    this.maxMismatchesInOverhang = maxMismatchesInOverhangs;
    this.maxBasesInOverhang = maxBasesInOverhangs;
    this.doNotFixOverhangs = doNotFixOverhangs;
    this.waitingReadGroups = new PriorityQueue<List<SplitRead>>(INITIAL_CAPACITY, new SplitReadComparator());
    this.outputToFile = false;
    this.mateChangedReads = new HashMap<>();
    this.processSecondaryReads = processSecondaryReads;
}
 
Example #3
Source File: MaskReferenceSequence.java    From Drop-seq with MIT License 6 votes vote down vote up
private void processByPartialContig (final ReferenceSequenceFile ref, final FastaSequenceFileWriter writer, final File intervalListFile) {
	SAMSequenceDictionary sd = ref.getSequenceDictionary();
	// validate that the intervals and the reference have the same sequence dictionary.
	IntervalList iList = IntervalList.fromFile(intervalListFile);
	iList.getHeader().getSequenceDictionary().assertSameDictionary(sd);
	// map the intervals to a map to each contig.
	Map<String, List<Interval>> intervalsPerContig = getIntervalsForContig(iList);

	for (SAMSequenceRecord r: sd.getSequences()) {
		String contig = r.getSequenceName();
		log.info("Processing partial contig " + contig);
		// this list can be null.
		List<Interval> intervalsToMask = intervalsPerContig.get(contig);
		ReferenceSequence rs = ref.getSequence(contig);
		writeSequence(rs, intervalsToMask, writer);
	}
}
 
Example #4
Source File: MaskReferenceSequence.java    From Drop-seq with MIT License 6 votes vote down vote up
@Override
protected int doWork() {
	IOUtil.assertFileIsReadable(this.REFERENCE_SEQUENCE);
	IOUtil.assertFileIsWritable(this.OUTPUT);
	// validate that an index is present for the reference sequence, since it's required.
	final ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE_SEQUENCE, true, true);
	if (!ref.isIndexed())
		throw new IllegalStateException ("Input fasta must be indexed.  You can do this by using samtools faidx to create an index");

	FastaSequenceFileWriter writer = new FastaSequenceFileWriter(OUTPUT, OUTPUT_LINE_LENGTH);
	if (this.CONTIG_PATTERN_TO_IGNORE!=null && !this.CONTIG_PATTERN_TO_IGNORE.isEmpty()) processByWholeContig(ref, writer, this.CONTIG_PATTERN_TO_IGNORE);
	if (this.INTERVALS!=null) processByPartialContig(ref, writer, this.INTERVALS);

	CloserUtil.close(ref);
	CloserUtil.close(writer);
	return 0;

}
 
Example #5
Source File: RefRepo.java    From cramtools with Apache License 2.0 6 votes vote down vote up
@Override
public List<Entry> call() throws Exception {
	List<Entry> list = new ArrayList<RefRepo.Entry>();

	ReferenceSequenceFile rsFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(file);

	ReferenceSequence sequence = null;
	while ((sequence = rsFile.nextSequence()) != null) {
		sequence.getBases();

		Entry e = new Entry();
		e.md5 = Utils.calculateMD5String(sequence.getBases());
		e.file = "file://" + file.getAbsolutePath();
		e.name = sequence.getName();
		e.length = sequence.length();
		log.info(String.format("New entry: %s", e.toString()));
		list.add(e);
	}
	return list;
}
 
Example #6
Source File: CachingIndexedFastaSequenceFileUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
private int mixedCasesTestHelper(final ReferenceSequenceFile original,
                                 final ReferenceSequenceFile casePreserving,
                                 final ReferenceSequenceFile allUpper,
                                 final String contig, final int start, final int stop ) {
    final String orig = fetchBaseString(original, contig, start, stop);
    final String keptCase = fetchBaseString(casePreserving, contig, start, stop);
    final String upperCase = fetchBaseString(allUpper, contig, start, stop);

    final String origToUpper = orig.toUpperCase();
    if ( ! orig.equals(origToUpper) ) {
        Assert.assertEquals(keptCase, orig, "Case preserving operation not equal to the original case for contig " + contig);
        Assert.assertEquals(upperCase, origToUpper, "All upper case reader not equal to the uppercase of original case for contig " + contig);
        return 1;
    } else {
        return 0;
    }
}
 
Example #7
Source File: HaplotypeCallerSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
private static FlatMapFunction<Iterator<AssemblyRegionWalkerContext>, VariantContext> assemblyFunction(final SAMFileHeader header,
                                                                                                       final String referenceFileName,
                                                                                                       final Broadcast<HaplotypeCallerArgumentCollection> hcArgsBroadcast,
                                                                                                       final Broadcast<AssemblyRegionArgumentCollection> assemblyRegionArgsBroadcast,
                                                                                                       final Broadcast<VariantAnnotatorEngine> annotatorEngineBroadcast) {
    return (FlatMapFunction<Iterator<AssemblyRegionWalkerContext>, VariantContext>) contexts -> {
        // HaplotypeCallerEngine isn't serializable but is expensive to instantiate, so construct and reuse one for every partition
        final ReferenceSequenceFile taskReferenceSequenceFile = taskReferenceSequenceFile(referenceFileName);
        final HaplotypeCallerEngine hcEngine = new HaplotypeCallerEngine(hcArgsBroadcast.value(), assemblyRegionArgsBroadcast.value(), false, false, header, taskReferenceSequenceFile, annotatorEngineBroadcast.getValue());
        Iterator<Iterator<VariantContext>> iterators = Utils.stream(contexts).map(context -> {
            AssemblyRegion region = context.getAssemblyRegion();
            FeatureContext featureContext = context.getFeatureContext();
            return hcEngine.callRegion(region, featureContext, context.getReferenceContext()).iterator();
        }).iterator();

        return Iterators.concat(iterators);
    };
}
 
Example #8
Source File: HaplotypeCallerEngine.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Create and initialize a new HaplotypeCallerEngine given a collection of HaplotypeCaller arguments, a reads header,
 * and a reference file
 *  @param hcArgs command-line arguments for the HaplotypeCaller
 * @param assemblyRegionArgs
 * @param createBamOutIndex true to create an index file for the bamout
 * @param createBamOutMD5 true to create an md5 file for the bamout
 * @param readsHeader header for the reads
 * @param referenceReader reader to provide reference data, this reference reader will be closed when {@link #shutdown()} is called
 * @param annotationEngine variantAnnotatorEngine with annotations to process already added
 */
public HaplotypeCallerEngine(final HaplotypeCallerArgumentCollection hcArgs, AssemblyRegionArgumentCollection assemblyRegionArgs, boolean createBamOutIndex,
                             boolean createBamOutMD5, final SAMFileHeader readsHeader,
                             ReferenceSequenceFile referenceReader, VariantAnnotatorEngine annotationEngine) {
    this.hcArgs = Utils.nonNull(hcArgs);
    this.readsHeader = Utils.nonNull(readsHeader);
    this.referenceReader = Utils.nonNull(referenceReader);
    this.annotationEngine = Utils.nonNull(annotationEngine);
    this.aligner = SmithWatermanAligner.getAligner(hcArgs.smithWatermanImplementation);
    trimmer = new AssemblyRegionTrimmer(assemblyRegionArgs, readsHeader.getSequenceDictionary());
    forceCallingAllelesPresent = hcArgs.alleles != null;
    initialize(createBamOutIndex, createBamOutMD5);
    if (hcArgs.assemblyStateOutput != null) {
        try {
            assemblyDebugOutStream = new PrintStream(Files.newOutputStream(IOUtils.getPath(hcArgs.assemblyStateOutput)));
        } catch (IOException e) {
            throw new UserException.CouldNotCreateOutputFile(hcArgs.assemblyStateOutput, "Provided argument for assembly debug graph location could not be created");
        }
    } else {
        assemblyDebugOutStream = null;
    }
}
 
Example #9
Source File: FingerprintUtils.java    From picard with MIT License 6 votes vote down vote up
private static VariantContextWriter getVariantContextWriter(final File outputFile,
                                                            final File referenceSequenceFileName,
                                                            final String sample,
                                                            final String source,
                                                            final ReferenceSequenceFile ref) {
    final VariantContextWriter variantContextWriter = new VariantContextWriterBuilder()
            .setReferenceDictionary(ref.getSequenceDictionary())
            .setOutputFile(outputFile).build();

    final Set<VCFHeaderLine> lines = new LinkedHashSet<>();
    lines.add(new VCFHeaderLine("reference", referenceSequenceFileName.getAbsolutePath()));
    lines.add(new VCFHeaderLine("source", source));
    lines.add(new VCFHeaderLine("fileDate", new Date().toString()));

    lines.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_PL_KEY));
    lines.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_ALLELE_DEPTHS));
    lines.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.DEPTH_KEY));

    final VCFHeader header = new VCFHeader(lines, Collections.singletonList(sample));
    header.setSequenceDictionary(ref.getSequenceDictionary());
    variantContextWriter.writeHeader(header);
    return variantContextWriter;
}
 
Example #10
Source File: FingerprintUtils.java    From picard with MIT License 6 votes vote down vote up
/**
 * A utility function that takes a fingerprint and returns a VariantContextSet with variants representing the haplotypes in the fingerprint
 *
 * @param fingerPrint A fingerprint
 * @param reference   A reference sequence that will be used to create the VariantContexts
 * @param sample      A sample name that will be used for the genotype field
 * @return VariantContextSet with variants representing the haplotypes in the fingerprint
 */
public static VariantContextSet createVCSetFromFingerprint(final Fingerprint fingerPrint, final ReferenceSequenceFile reference, final String sample) {

    final VariantContextSet variantContexts = new VariantContextSet(reference.getSequenceDictionary());

    // check that the same snp name isn't twice in the fingerprint.
    fingerPrint.values().stream()
            .map(hp -> hp.getRepresentativeSnp().getName())
            .filter(Objects::nonNull)
            .filter(n -> !n.equals(""))
            .collect(Collectors.groupingBy(Function.identity(), Collectors.counting()))
            .entrySet()
            .stream()
            .filter(e -> e.getValue() > 1)
            .findFirst()
            .ifPresent(e -> {
                throw new IllegalArgumentException("Found same SNP name twice (" + e.getKey() + ") in fingerprint. Cannot create a VCF.");
            });

    // convert all the haplotypes to variant contexts and add them to the set.
    fingerPrint.values().stream()
            .map(hp -> getVariantContext(reference, sample, hp))
            .forEach(variantContexts::add);

    return variantContexts;
}
 
Example #11
Source File: GcBiasUtils.java    From picard with MIT License 6 votes vote down vote up
public static int[] calculateRefWindowsByGc(final int windows, final File referenceSequence, final int windowSize) {
    final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceSequence);
    ReferenceSequence ref;

    final int [] windowsByGc = new int [windows];

    while ((ref = refFile.nextSequence()) != null) {
        final byte[] refBases = ref.getBases();
        StringUtil.toUpperCase(refBases);
        final int refLength = refBases.length;
        final int lastWindowStart = refLength - windowSize;

        final CalculateGcState state = new GcBiasUtils().new CalculateGcState();

        for (int i = 1; i < lastWindowStart; ++i) {
            final int windowEnd = i + windowSize;
            final int gcBin = calculateGc(refBases, i, windowEnd, state);
            if (gcBin != -1) windowsByGc[gcBin]++;
        }
    }

    return windowsByGc;
}
 
Example #12
Source File: HsMetricCollector.java    From picard with MIT License 6 votes vote down vote up
public HsMetricCollector(final Set<MetricAccumulationLevel> accumulationLevels,
                         final List<SAMReadGroupRecord> samRgRecords,
                         final ReferenceSequenceFile refFile,
                         final File perTargetCoverage,
                         final File perBaseCoverage,
                         final IntervalList targetIntervals,
                         final IntervalList probeIntervals,
                         final String probeSetName,
                         final int nearProbeDistance,
                         final int minimumMappingQuality,
                         final int minimumBaseQuality,
                         final boolean clipOverlappingReads,
                         final int coverageCap,
                         final int sampleSize) {
    super(accumulationLevels, samRgRecords, refFile, perTargetCoverage, perBaseCoverage, targetIntervals, probeIntervals, probeSetName, nearProbeDistance, minimumMappingQuality, minimumBaseQuality, clipOverlappingReads, coverageCap, sampleSize);
}
 
Example #13
Source File: HsMetricCollector.java    From picard with MIT License 6 votes vote down vote up
public HsMetricCollector(final Set<MetricAccumulationLevel> accumulationLevels,
                         final List<SAMReadGroupRecord> samRgRecords,
                         final ReferenceSequenceFile refFile,
                         final File perTargetCoverage,
                         final File perBaseCoverage,
                         final IntervalList targetIntervals,
                         final IntervalList probeIntervals,
                         final String probeSetName,
                         final int nearProbeDistance,
                         final int minimumMappingQuality,
                         final int minimumBaseQuality,
                         final boolean clipOverlappingReads,
                         final boolean noSideEffects,
                         final boolean includeIndels,
                         final int coverageCap,
                         final int sampleSize) {
    super(accumulationLevels, samRgRecords, refFile, perTargetCoverage, perBaseCoverage, targetIntervals, probeIntervals, probeSetName, nearProbeDistance, minimumMappingQuality, minimumBaseQuality, clipOverlappingReads, noSideEffects, includeIndels, coverageCap, sampleSize);
}
 
Example #14
Source File: TargetMetricsCollector.java    From picard with MIT License 6 votes vote down vote up
public TargetMetricsCollector(final Set<MetricAccumulationLevel> accumulationLevels,
                              final List<SAMReadGroupRecord> samRgRecords,
                              final ReferenceSequenceFile refFile,
                              final File perTargetCoverage,
                              final File perBaseCoverage,
                              final IntervalList targetIntervals,
                              final IntervalList probeIntervals,
                              final String probeSetName,
                              final int nearProbeDistance,
                              final int minimumMappingQuality,
                              final int minimumBaseQuality,
                              final boolean clipOverlappingReads,
                              final int coverageCap,
                              final int sampleSize) {
    this(accumulationLevels, samRgRecords, refFile, perTargetCoverage, perBaseCoverage, targetIntervals, probeIntervals, probeSetName, nearProbeDistance, minimumMappingQuality, minimumBaseQuality, clipOverlappingReads, false, false, coverageCap, sampleSize);
}
 
Example #15
Source File: TargetedPcrMetricsCollector.java    From picard with MIT License 6 votes vote down vote up
public TargetedPcrMetricsCollector(final Set<MetricAccumulationLevel> accumulationLevels,
                                   final List<SAMReadGroupRecord> samRgRecords,
                                   final ReferenceSequenceFile refFile,
                                   final File perTargetCoverage,
                                   final File perBaseCoverage,
                                   final IntervalList targetIntervals,
                                   final IntervalList probeIntervals,
                                   final String probeSetName,
                                   final int nearProbeDistance,
                                   final int minimumMappingQuality,
                                   final int minimumBaseQuality,
                                   final boolean clipOverlappingReads,
                                   final int coverageCap,
                                   final int sampleSize) {
    super(accumulationLevels, samRgRecords, refFile, perTargetCoverage, perBaseCoverage, targetIntervals, probeIntervals, probeSetName, nearProbeDistance, minimumMappingQuality, minimumBaseQuality, clipOverlappingReads, coverageCap, sampleSize);
}
 
Example #16
Source File: CachingIndexedFastaSequenceFileUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
private String fetchBaseString(final ReferenceSequenceFile reader, final String contig, final int start, final int stop) {
    if ( start == -1 ) {
        return new String(reader.getSequence(contig).getBases());
    } else {
        return new String(reader.getSubsequenceAt(contig, start, stop).getBases());
    }
}
 
Example #17
Source File: HaplotypeCallerSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
private static Broadcast<Supplier<AssemblyRegionEvaluator>> assemblyRegionEvaluatorSupplierBroadcast(
        final JavaSparkContext ctx,
        final HaplotypeCallerArgumentCollection hcArgs,
        AssemblyRegionArgumentCollection assemblyRegionArgs, final SAMFileHeader header,
        final String reference,
        final Collection<Annotation> annotations) {
    final Path referencePath = IOUtils.getPath(reference);
    final String referenceFileName = referencePath.getFileName().toString();
    final ReferenceSequenceFile taskReferenceSequenceFile = taskReferenceSequenceFile(referenceFileName);
    final VariantAnnotatorEngine annotatorEngine = new VariantAnnotatorEngine(annotations,  hcArgs.dbsnp.dbsnp, hcArgs.comps, hcArgs.emitReferenceConfidence != ReferenceConfidenceMode.NONE, false);
    return assemblyRegionEvaluatorSupplierBroadcastFunction(ctx, hcArgs, assemblyRegionArgs, header, taskReferenceSequenceFile, annotatorEngine);
}
 
Example #18
Source File: ValidateReference.java    From Drop-seq with MIT License 5 votes vote down vote up
private void validateReferenceBases(File referenceFile) {
    final ReferenceSequenceFile refSeqFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceFile, true);
    ReferenceSequence sequence;
    while ((sequence = refSeqFile.nextSequence()) != null) {
        for (final byte base: sequence.getBases()) {
            if (!IUPAC_TABLE[base]) {
                messages.baseErrors = String.format("WARNING: AT least one invalid base '%c' (decimal %d) in reference sequence named %s",
                        StringUtil.byteToChar(base), base, sequence.getName());
                break;
            }
        }
    }
}
 
Example #19
Source File: CachingIndexedFastaSequenceFileUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Test(dataProvider = "fastas")
public void testCachingIndexedFastaReaderTwoStage(Path fasta, Path unzipped, int cacheSize, int querySize) throws IOException {
    try(final ReferenceSequenceFile uncached = ReferenceSequenceFileFactory.getReferenceSequenceFile(unzipped);
        final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true, false)) {

        SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0);

        int middleStart = (contig.getSequenceLength() - querySize) / 2;
        int middleStop = middleStart + querySize;

        logger.debug(String.format(
                "Checking contig %s length %d with cache size %d and query size %d with intermediate query",
                contig.getSequenceName(), contig.getSequenceLength(), cacheSize, querySize));

        for (int i = 0; i < contig.getSequenceLength(); i += 10) {
            int start = i;
            int stop = start + querySize;
            if (stop <= contig.getSequenceLength()) {
                ReferenceSequence grabMiddle = caching.getSubsequenceAt(contig.getSequenceName(), middleStart,
                                                                        middleStop);
                ReferenceSequence cachedVal = caching.getSubsequenceAt(contig.getSequenceName(), start, stop);
                ReferenceSequence uncachedVal = uncached.getSubsequenceAt(contig.getSequenceName(), start, stop);

                Assert.assertEquals(cachedVal.getName(), uncachedVal.getName());
                Assert.assertEquals(cachedVal.getContigIndex(), uncachedVal.getContigIndex());
                Assert.assertEquals(cachedVal.getBases(), uncachedVal.getBases());
            }
        }
    }
}
 
Example #20
Source File: CachingIndexedFastaSequenceFileUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
private void testSequential(final CachingIndexedFastaSequenceFile caching, final Path unzipped, final int querySize) throws IOException {
    Assert.assertTrue(caching.isPreservingCase(), "testSequential only works for case preserving CachingIndexedFastaSequenceFile readers");

    //use an unzipped version of the fasta because it's slow to repeatedly query the zipped version without caching
    try( final ReferenceSequenceFile uncached = ReferenceSequenceFileFactory.getReferenceSequenceFile(unzipped)) {

        SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0);
        for (int i = 0; i < contig.getSequenceLength(); i += STEP_SIZE) {
            int start = i;
            int stop = start + querySize;
            if (stop <= contig.getSequenceLength()) {
                ReferenceSequence cachedVal = caching.getSubsequenceAt(contig.getSequenceName(), start, stop);
                ReferenceSequence uncachedVal = uncached.getSubsequenceAt(contig.getSequenceName(), start, stop);

                Assert.assertEquals(cachedVal.getName(), uncachedVal.getName());
                Assert.assertEquals(cachedVal.getContigIndex(), uncachedVal.getContigIndex());
                Assert.assertEquals(cachedVal.getBases(), uncachedVal.getBases());
            }
        }

        // asserts for efficiency.  We are going to make contig.length / STEP_SIZE queries
        // at each of range: start -> start + querySize against a cache with size of X.
        // we expect to hit the cache each time range falls within X.  We expect a hit
        // on the cache if range is within X.  Which should happen at least (X - query_size * 2) / STEP_SIZE
        // times.
        final int minExpectedHits = (int) Math.floor(
                (Math.min(caching.getCacheSize(), contig.getSequenceLength()) - querySize * 2.0) / STEP_SIZE);
        caching.printEfficiency(Level.DEBUG);
        Assert.assertTrue(caching.getCacheHits() >= minExpectedHits,
                          "Expected at least " + minExpectedHits + " cache hits but only got " + caching.getCacheHits());
    }

}
 
Example #21
Source File: ActivityProfileUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@BeforeClass
public void init() throws IOException {
    // sequence
    try(final ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(IOUtils.getPath(b37_reference_20_21))) {
        genomeLocParser = new GenomeLocParser(seq);
        startLoc = genomeLocParser.createGenomeLoc("20", 0, 1, 100);
        header = new SAMFileHeader();
        seq.getSequenceDictionary().getSequences().forEach(sequence -> header.addSequence(sequence));
    }
}
 
Example #22
Source File: CachingIndexedFastaSequenceFile.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Assert that the fasta reader we opened is indexed.  It should be because we asserted that the indexes existed
 * in {@link #checkFastaPath(Path)}
 */
private static ReferenceSequenceFile requireIndex(Path fasta, ReferenceSequenceFile referenceSequenceFile) {
    if (!referenceSequenceFile.isIndexed()) {
        throw new GATKException("Could not load " + fasta.toUri().toString() + " as an indexed fasta despite passing checks before loading.");
    }
    return referenceSequenceFile;
}
 
Example #23
Source File: ReferenceFileSparkSource.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
public Map<String, ReferenceBases> getAllReferenceBases() throws IOException {
    try ( ReferenceSequenceFile referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(getReferencePath()) ) {
        Map<String, ReferenceBases> bases = new LinkedHashMap<>();
        ReferenceSequence seq;
        while ( (seq = referenceSequenceFile.nextSequence()) != null ) {
            String name = seq.getName();
            bases.put(name, new ReferenceBases(seq.getBases(), new SimpleInterval(name, 1, seq.length())));
        }
        return bases;
    }
}
 
Example #24
Source File: ReferenceFileSparkSource.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Override
public ReferenceBases getReferenceBases(final SimpleInterval interval) throws IOException {
    try ( ReferenceSequenceFile referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(getReferencePath()) ) {
        ReferenceSequence sequence = referenceSequenceFile.getSubsequenceAt(interval.getContig(), interval.getStart(), interval.getEnd());
        return new ReferenceBases(sequence.getBases(), interval);
    }
}
 
Example #25
Source File: HaplotypeCallerSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
private static void processAssemblyRegions(
        JavaRDD<AssemblyRegionWalkerContext> rdd,
        final JavaSparkContext ctx,
        final SAMFileHeader header,
        final String reference,
        final HaplotypeCallerArgumentCollection hcArgs,
        final AssemblyRegionArgumentCollection assemblyRegionArgs,
        final String output,
        final Collection<Annotation> annotations,
        final Logger logger,
        final boolean createOutputVariantIndex) {

    final VariantAnnotatorEngine variantannotatorEngine = new VariantAnnotatorEngine(annotations,  hcArgs.dbsnp.dbsnp, hcArgs.comps, hcArgs.emitReferenceConfidence != ReferenceConfidenceMode.NONE, false);

    final Path referencePath = IOUtils.getPath(reference);
    final ReferenceSequenceFile driverReferenceSequenceFile = new CachingIndexedFastaSequenceFile(referencePath);
    final HaplotypeCallerEngine hcEngine = new HaplotypeCallerEngine(hcArgs, assemblyRegionArgs, false, false, header, driverReferenceSequenceFile, variantannotatorEngine);
    final String referenceFileName = referencePath.getFileName().toString();
    final Broadcast<HaplotypeCallerArgumentCollection> hcArgsBroadcast = ctx.broadcast(hcArgs);
    final Broadcast<AssemblyRegionArgumentCollection> assemblyRegionArgsBroadcast = ctx.broadcast(assemblyRegionArgs);
    final Broadcast<VariantAnnotatorEngine> annotatorEngineBroadcast = ctx.broadcast(variantannotatorEngine);

    final JavaRDD<VariantContext> variants = rdd.mapPartitions(assemblyFunction(header, referenceFileName, hcArgsBroadcast, assemblyRegionArgsBroadcast, annotatorEngineBroadcast));

    try {
        VariantsSparkSink.writeVariants(ctx, output, variants, hcEngine.makeVCFHeader(header.getSequenceDictionary(), new HashSet<>()),
                hcArgs.emitReferenceConfidence == ReferenceConfidenceMode.GVCF, new ArrayList<Number>(hcArgs.GVCFGQBands), hcArgs.standardArgs.genotypeArgs.samplePloidy,
                0, createOutputVariantIndex, false);
    } catch (IOException e) {
        throw new UserException.CouldNotCreateOutputFile(output, "writing failed", e);
    }
}
 
Example #26
Source File: WgsMetricsProcessorImplTest.java    From picard with MIT License 5 votes vote down vote up
@Test
public void testForFilteredBases(){
    AbstractLocusIterator iterator = createReadEndsIterator(exampleSamOneRead);
    CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics();
    FastWgsMetricsCollector collector =  new FastWgsMetricsCollector(collectWgsMetrics, 100, createIntervalList());
    String secondReferenceString = ">ref\nNNNNNNNNNNAATATTCTTC";
    ReferenceSequenceFile referenceSequenceFile = createReferenceSequenceFile(secondReferenceString);
    ReferenceSequenceFileWalker refWalker =  new ReferenceSequenceFileWalker(referenceSequenceFile);
    WgsMetricsProcessorImpl wgsMetricsProcessor =
            new WgsMetricsProcessorImpl(iterator, refWalker, collector, progress);
    wgsMetricsProcessor.processFile();
    assertEquals(10, collector.counter);
}
 
Example #27
Source File: ExtractSequences.java    From picard with MIT License 5 votes vote down vote up
@Override
protected int doWork() {
    IOUtil.assertFileIsReadable(INTERVAL_LIST);
    IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
    IOUtil.assertFileIsWritable(OUTPUT);

    final IntervalList intervals = IntervalList.fromFile(INTERVAL_LIST);
    final ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE_SEQUENCE);
    SequenceUtil.assertSequenceDictionariesEqual(intervals.getHeader().getSequenceDictionary(), ref.getSequenceDictionary());

    final BufferedWriter out = IOUtil.openFileForBufferedWriting(OUTPUT);

    for (final Interval interval : intervals) {
        final ReferenceSequence seq = ref.getSubsequenceAt(interval.getContig(), interval.getStart(), interval.getEnd());
        final byte[] bases = seq.getBases();
        if (interval.isNegativeStrand()) SequenceUtil.reverseComplement(bases);

        try {
            out.write(">");
            out.write(interval.getName());
            out.write("\n");

            for (int i=0; i<bases.length; ++i) {
                if (i > 0 && i % LINE_LENGTH == 0) out.write("\n");
                out.write(bases[i]);
            }

            out.write("\n");
        }
        catch (IOException ioe) {
            throw new PicardException("Error writing to file " + OUTPUT.getAbsolutePath(), ioe);

        }
    }

    CloserUtil.close(out);

    return 0;
}
 
Example #28
Source File: HaplotypeCallerSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
private static Broadcast<Supplier<AssemblyRegionEvaluator>> assemblyRegionEvaluatorSupplierBroadcastFunction(
        final JavaSparkContext ctx,
        final HaplotypeCallerArgumentCollection hcArgs,
        AssemblyRegionArgumentCollection assemblyRegionArgs, final SAMFileHeader header,
        final ReferenceSequenceFile taskReferenceSequenceFile,
        final VariantAnnotatorEngine annotatorEngine) {
    Supplier<AssemblyRegionEvaluator> supplier = new Supplier<AssemblyRegionEvaluator>() {
        @Override
        public AssemblyRegionEvaluator get() {
            return new HaplotypeCallerEngine(hcArgs, assemblyRegionArgs, false, false, header, taskReferenceSequenceFile, annotatorEngine);
        }
    };
    return ctx.broadcast(supplier);
}
 
Example #29
Source File: HaplotypeCallerSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Override
protected Broadcast<Supplier<AssemblyRegionEvaluator>> assemblyRegionEvaluatorSupplierBroadcast(final JavaSparkContext ctx) {
    final Path referencePath = IOUtils.getPath(referenceArguments.getReferenceFileName());
    final String referenceFileName = referencePath.getFileName().toString();
    final String pathOnExecutor = SparkFiles.get(referenceFileName);
    final ReferenceSequenceFile taskReferenceSequenceFile = new CachingIndexedFastaSequenceFile(IOUtils.getPath(pathOnExecutor));
    final Collection<Annotation> annotations = makeVariantAnnotations();
    final VariantAnnotatorEngine annotatorEngine = new VariantAnnotatorEngine(annotations,  hcArgs.dbsnp.dbsnp, hcArgs.comps, hcArgs.emitReferenceConfidence != ReferenceConfidenceMode.NONE, false);
    return assemblyRegionEvaluatorSupplierBroadcastFunction(ctx, hcArgs, assemblyRegionArgs, getHeaderForReads(), taskReferenceSequenceFile, annotatorEngine);
}
 
Example #30
Source File: Utils.java    From cramtools with Apache License 2.0 5 votes vote down vote up
public static byte[] getBasesFromReferenceFile(String referenceFilePath, String seqName, int from, int length) {
	ReferenceSequenceFile referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(
			referenceFilePath));
	ReferenceSequence sequence = referenceSequenceFile.getSequence(seqName);
	byte[] bases = referenceSequenceFile.getSubsequenceAt(sequence.getName(), from, from + length).getBases();
	return bases;
}