htsjdk.samtools.reference.ReferenceSequenceFileWalker Java Examples

The following examples show how to use htsjdk.samtools.reference.ReferenceSequenceFileWalker. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: SetNmMdAndUqTags.java    From picard with MIT License 6 votes vote down vote up
protected int doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);
    final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);

    if (reader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
        throw new SAMException("Input must be coordinate-sorted for this program to run. Found: " + reader.getFileHeader().getSortOrder());
    }

    final SAMFileWriter writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(reader.getFileHeader(), true, OUTPUT);
    writer.setProgressLogger(
            new ProgressLogger(log, (int) 1e7, "Wrote", "records"));

    final ReferenceSequenceFileWalker refSeqWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);

    StreamSupport.stream(reader.spliterator(), false)
            .peek(rec -> fixRecord(rec, refSeqWalker))
            .forEach(writer::addAlignment);
    CloserUtil.close(reader);
    writer.close();
    return 0;
}
 
Example #2
Source File: BaseErrorCalculationTest.java    From picard with MIT License 6 votes vote down vote up
@Test(dataProvider = "testOverlappingErrorCalculatorWithManyReadsData", timeOut = 5000)
public void testOverlappingErrorCalculatorWithManyReads(final File temp) throws IOException {

    try (final ReferenceSequenceFileWalker referenceSequenceFileWalker =
                 new ReferenceSequenceFileWalker(CommandLineProgramTest.CHR_M_REFERENCE.getAbsoluteFile());
         final SamLocusIterator samLocusIterator = new SamLocusIterator(SamReaderFactory.make().open(temp));
         final SamLocusAndReferenceIterator samLocusAndReferences = new SamLocusAndReferenceIterator(
                 referenceSequenceFileWalker, samLocusIterator)) {

        BaseErrorAggregation<OverlappingReadsErrorCalculator> aggregation =
                new BaseErrorAggregation<>(OverlappingReadsErrorCalculator::new, ReadBaseStratification.baseCycleStratifier);

        for (final SamLocusAndReferenceIterator.SAMLocusAndReference locusAndReference : samLocusAndReferences) {
            for (SamLocusIterator.RecordAndOffset recordAndOffset : locusAndReference.getRecordAndOffsets())

                aggregation.addBase(recordAndOffset, locusAndReference);
        }
    }
}
 
Example #3
Source File: BaseErrorCalculationTest.java    From picard with MIT License 6 votes vote down vote up
@DataProvider
public Object[][] testOverlappingErrorCalculatorWithManyReadsData() throws IOException {
    final File temp = File.createTempFile("Overlapping", ".bam");
    temp.deleteOnExit();

    try (
            final ReferenceSequenceFileWalker referenceSequenceFileWalker =
                    new ReferenceSequenceFileWalker(CommandLineProgramTest.CHR_M_REFERENCE)) {

        final SAMRecordSetBuilder builder = new SAMRecordSetBuilder();
        builder.getHeader().setSequenceDictionary(referenceSequenceFileWalker.getSequenceDictionary());

        for (int i = 0; i < 4000; i++) {
            builder.addPair("Read" + i, 0, 1, 1,
                    false, false, "36M", "36M", true, false, 20);
        }

        try (final SAMFileWriter writer = new SAMFileWriterFactory()
                .setCompressionLevel(2)
                .makeBAMWriter(builder.getHeader(), false, temp)) {
            builder.forEach(writer::addAlignment);
        }
    }
    return new Object[][]{{temp}};
}
 
Example #4
Source File: AllelicCountCollector.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Constructs a {@link AllelicCountCollector} object for calculating {@link AllelicCountCollection} objects from
 * BAMs based on a reference genome.
 * @param referenceFile         file containing the reference
 * @param validationStringency  validation stringency to use for reading BAM files
 */
public AllelicCountCollector(final File referenceFile,
                             final ValidationStringency validationStringency) {
    IOUtils.canReadFile(referenceFile);
    readerFactory = SamReaderFactory.makeDefault().validationStringency(validationStringency).referenceSequence(referenceFile);
    referenceWalker = new ReferenceSequenceFileWalker(referenceFile);
}
 
Example #5
Source File: SetNmMdAndUqTags.java    From picard with MIT License 5 votes vote down vote up
private void fixRecord(SAMRecord record, ReferenceSequenceFileWalker refSeqWalker){
    if (!record.getReadUnmappedFlag()) {
        if (SET_ONLY_UQ) {
            AbstractAlignmentMerger.fixUq(record, refSeqWalker, IS_BISULFITE_SEQUENCE);
        } else {
            AbstractAlignmentMerger.fixNmMdAndUq(record, refSeqWalker, IS_BISULFITE_SEQUENCE);
        }
    }
}
 
Example #6
Source File: AbstractAlignmentMerger.java    From picard with MIT License 5 votes vote down vote up
/** Calculates and sets the NM, MD, and and UQ tags from the record and the reference
 *
 * @param record the record to be fixed
 * @param refSeqWalker a ReferenceSequenceWalker that will be used to traverse the reference
 * @param isBisulfiteSequence a flag indicating whether the sequence came from bisulfite-sequencing which would imply a different
 * calculation of the NM tag.
 *
 * No return value, modifies the provided record.
 */
public static void fixNmMdAndUq(final SAMRecord record, final ReferenceSequenceFileWalker refSeqWalker, final boolean isBisulfiteSequence) {
    final byte[] referenceBases = refSeqWalker.get(record.getReferenceIndex()).getBases();
    // only recalculate NM if it isn't bisulfite, since it needs to be treated specially below
    SequenceUtil.calculateMdAndNmTags(record, referenceBases, true, !isBisulfiteSequence);
    if (isBisulfiteSequence) {  // recalculate the NM tag for bisulfite data
        record.setAttribute(SAMTag.NM.name(), SequenceUtil.calculateSamNmTag(record, referenceBases, 0, isBisulfiteSequence));
    }
    fixUq(record, refSeqWalker, isBisulfiteSequence);
}
 
Example #7
Source File: WgsMetricsProcessorImplTest.java    From picard with MIT License 5 votes vote down vote up
@Test
public void testForFilteredBases(){
    AbstractLocusIterator iterator = createReadEndsIterator(exampleSamOneRead);
    CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics();
    FastWgsMetricsCollector collector =  new FastWgsMetricsCollector(collectWgsMetrics, 100, createIntervalList());
    String secondReferenceString = ">ref\nNNNNNNNNNNAATATTCTTC";
    ReferenceSequenceFile referenceSequenceFile = createReferenceSequenceFile(secondReferenceString);
    ReferenceSequenceFileWalker refWalker =  new ReferenceSequenceFileWalker(referenceSequenceFile);
    WgsMetricsProcessorImpl wgsMetricsProcessor =
            new WgsMetricsProcessorImpl(iterator, refWalker, collector, progress);
    wgsMetricsProcessor.processFile();
    assertEquals(10, collector.counter);
}
 
Example #8
Source File: WgsMetricsProcessorImpl.java    From picard with MIT License 5 votes vote down vote up
/**
 * @param iterator  input {@link htsjdk.samtools.util.AbstractLocusIterator}
 * @param refWalker over processed reference file
 * @param collector input {@link picard.analysis.AbstractWgsMetricsCollector}
 * @param progress  logger
 */
public WgsMetricsProcessorImpl(AbstractLocusIterator<T, AbstractLocusInfo<T>> iterator,
        ReferenceSequenceFileWalker refWalker,
        AbstractWgsMetricsCollector<T> collector,
        ProgressLogger progress) {
    this.iterator = iterator;
    this.collector = collector;
    this.refWalker = refWalker;
    this.progress = progress;
}
 
Example #9
Source File: AbstractAlignmentMerger.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
protected void resetRefSeqFileWalker() {
    this.refSeq = new ReferenceSequenceFileWalker(referenceFasta);
}
 
Example #10
Source File: GatherGeneGCLength.java    From Drop-seq with MIT License 4 votes vote down vote up
@Override
   protected int doWork() {

       IOUtil.assertFileIsReadable(ANNOTATIONS_FILE);
       IOUtil.assertFileIsWritable(this.OUTPUT);
       PrintStream out = new ErrorCheckingPrintStream(IOUtil.openFileForWriting(OUTPUT));
       writeHeader(out);

       PrintStream outTranscript = null;
       if (this.OUTPUT_TRANSCRIPT_LEVEL!=null) {
		outTranscript = new ErrorCheckingPrintStream(IOUtil.openFileForWriting(OUTPUT_TRANSCRIPT_LEVEL));
		writeHeaderTranscript(outTranscript);
       }

       FastaSequenceFileWriter  outSequence = null;

       if (this.OUTPUT_TRANSCRIPT_SEQUENCES!=null) {
       	IOUtil.assertFileIsWritable(this.OUTPUT_TRANSCRIPT_SEQUENCES);
		outSequence = new FastaSequenceFileWriter (this.OUTPUT_TRANSCRIPT_SEQUENCES);
       }
       ReferenceSequenceFileWalker refFileWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);

       SAMSequenceDictionary dict= refFileWalker.getSequenceDictionary();
       if (dict==null) {
       	CloserUtil.close(refFileWalker);
       	throw new IllegalArgumentException("Reference file" + this.REFERENCE_SEQUENCE.getAbsolutePath()+" is missing a dictionary file [.dict].  Please make one!");
       }

       OverlapDetector<Gene> geneOverlapDetector= GeneAnnotationReader.loadAnnotationsFile(ANNOTATIONS_FILE, dict);

       List<SAMSequenceRecord> records = dict.getSequences();

	for (SAMSequenceRecord record: records) {
		String seqName = record.getSequenceName();
		int seqIndex=dict.getSequenceIndex(seqName);
		ReferenceSequence fastaRef=refFileWalker.get(seqIndex);

		// get the genes for this contig.
		Interval i = new Interval(seqName, 1, record.getSequenceLength());
		Collection< Gene> genes = geneOverlapDetector.getOverlaps(i);
		for (Gene g: genes) {
			List<GCResult> gcList = calculateGCContentGene(g, fastaRef, dict);
			if (this.OUTPUT_TRANSCRIPT_LEVEL!=null)
				writeResultTranscript(gcList, outTranscript);
			GCIsoformSummary summary = new GCIsoformSummary(g, gcList);
			if (this.OUTPUT_TRANSCRIPT_SEQUENCES!=null)
				writeTranscriptSequence(g, fastaRef, dict, outSequence);

			GCResult gc = calculateGCContentUnionExons(g, fastaRef, dict);

			writeResult(gc, summary, out);
		}
	}
	CloserUtil.close(refFileWalker);
	CloserUtil.close(out);
	if (this.OUTPUT_TRANSCRIPT_LEVEL!=null) CloserUtil.close(outTranscript);
	if (this.OUTPUT_TRANSCRIPT_SEQUENCES!=null) CloserUtil.close(outSequence);
       return 0;
}
 
Example #11
Source File: AbstractAlignmentMerger.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
/**
 * Constructor
 *
 * @param unmappedBamFile                   The BAM file that was used as the input to the aligner, which will
 *                                          include info on all the reads that did not map.  Required.
 * @param targetBamFile                     The file to which to write the merged SAM records. Required.
 * @param referenceFasta                    The reference sequence for the map files. Required.
 * @param clipAdapters                      Whether adapters marked in unmapped BAM file should be marked as
 *                                          soft clipped in the merged bam. Required.
 * @param bisulfiteSequence                 Whether the reads are bisulfite sequence (used when calculating the
 *                                          NM and UQ tags). Required.
 * @param alignedReadsOnly                  Whether to output only those reads that have alignment data
 * @param programRecord                     Program record for target file SAMRecords created.
 * @param attributesToRetain                private attributes from the alignment record that should be
 *                                          included when merging.  This overrides the exclusion of
 *                                          attributes whose tags start with the reserved characters
 *                                          of X, Y, and Z
 * @param attributesToRemove                attributes from the alignment record that should be
 *                                          removed when merging.  This overrides attributesToRetain if they share
 *                                          common tags.
 * @param read1BasesTrimmed                 The number of bases trimmed from start of read 1 prior to alignment.  Optional.
 * @param read2BasesTrimmed                 The number of bases trimmed from start of read 2 prior to alignment.  Optional.
 * @param expectedOrientations              A List of SamPairUtil.PairOrientations that are expected for
 *                                          aligned pairs.  Used to determine the properPair flag.
 * @param sortOrder                         The order in which the merged records should be output.  If null,
 *                                          output will be coordinate-sorted
 * @param primaryAlignmentSelectionStrategy What to do when there are multiple primary alignments, or multiple
 *                                          alignments but none primary, for a read or read pair.
 * @param addMateCigar                      True if we are to add or maintain the mate CIGAR (MC) tag, false if we are to remove or not include.
 */
public AbstractAlignmentMerger(final File unmappedBamFile, final File targetBamFile,
                               final File referenceFasta, final boolean clipAdapters,
                               final boolean bisulfiteSequence, final boolean alignedReadsOnly,
                               final SAMProgramRecord programRecord, final List<String> attributesToRetain,
                               final List<String> attributesToRemove,
                               final Integer read1BasesTrimmed, final Integer read2BasesTrimmed,
                               final List<SamPairUtil.PairOrientation> expectedOrientations,
                               final SAMFileHeader.SortOrder sortOrder,
                               final PrimaryAlignmentSelectionStrategy primaryAlignmentSelectionStrategy,
                               final boolean addMateCigar) {
    IOUtil.assertFileIsReadable(unmappedBamFile);
    IOUtil.assertFileIsWritable(targetBamFile);
    IOUtil.assertFileIsReadable(referenceFasta);

    this.unmappedBamFile = unmappedBamFile;
    this.targetBamFile = targetBamFile;
    this.referenceFasta = referenceFasta;

    this.refSeq = new ReferenceSequenceFileWalker(referenceFasta);
    this.sequenceDictionary = refSeq.getSequenceDictionary();
    if (this.sequenceDictionary == null) {
        throw new UserException("No sequence dictionary found for " + referenceFasta.getAbsolutePath() +
                ".  Use CreateSequenceDictionary.jar to create a sequence dictionary.");
    }

    this.clipAdapters = clipAdapters;
    this.bisulfiteSequence = bisulfiteSequence;
    this.alignedReadsOnly = alignedReadsOnly;

    this.header = new SAMFileHeader();
    this.sortOrder = sortOrder != null ? sortOrder : SAMFileHeader.SortOrder.coordinate;
    header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
    if (programRecord != null) {
        setProgramRecord(programRecord);
    }
    header.setSequenceDictionary(this.sequenceDictionary);
    if (attributesToRetain != null) {
        this.attributesToRetain.addAll(attributesToRetain);
    }
    if (attributesToRemove != null) {
        this.attributesToRemove.addAll(attributesToRemove);
        // attributesToRemove overrides attributesToRetain
        if (!this.attributesToRetain.isEmpty()) {
            for (String attribute : this.attributesToRemove) {
                if (this.attributesToRetain.contains(attribute)) {
                    logger.info("Overriding retaining the " + attribute + " tag since remove overrides retain.");
                    this.attributesToRetain.remove(attribute);
                }
            }
        }
    }
    this.read1BasesTrimmed = read1BasesTrimmed;
    this.read2BasesTrimmed = read2BasesTrimmed;
    this.expectedOrientations = expectedOrientations;

    this.primaryAlignmentSelectionStrategy = primaryAlignmentSelectionStrategy;

    this.addMateCigar = addMateCigar;
}
 
Example #12
Source File: CollectWgsMetricsTestUtils.java    From picard with MIT License 4 votes vote down vote up
static ReferenceSequenceFileWalker getReferenceSequenceFileWalker(){
    String referenceString = ">ref\nACCTACGTTCAATATTCTTC";
    return getReferenceSequenceFileWalker(referenceString);
}
 
Example #13
Source File: CollectWgsMetricsTestUtils.java    From picard with MIT License 4 votes vote down vote up
static ReferenceSequenceFileWalker getReferenceSequenceFileWalker(final String referenceString){
    ReferenceSequenceFile referenceSequenceFile = createReferenceSequenceFile(referenceString);
    return new ReferenceSequenceFileWalker(referenceSequenceFile);
}
 
Example #14
Source File: CollectSamErrorMetricsTest.java    From picard with MIT License 4 votes vote down vote up
@Test(dataProvider = "provideForTestIndelErrors")
public void testIndelErrors(final String[] readCigars, final String errorSubscript, IndelErrorMetric expectedMetric) throws IOException {

    final File temp = File.createTempFile("Indels", ".bam");
    temp.deleteOnExit();

    final int priorQ = 30;

    try (final ReferenceSequenceFileWalker referenceSequenceFileWalker =
                    new ReferenceSequenceFileWalker(CommandLineProgramTest.CHR_M_REFERENCE)) {

        final SAMRecordSetBuilder builder = new SAMRecordSetBuilder();
        builder.getHeader().setSequenceDictionary(referenceSequenceFileWalker.getSequenceDictionary());

        for (int i = 0; i < readCigars.length; i++) {
            // 10M2I3M4D10M5I
            builder.addFrag("Read" + i, 0, 100, false, false, readCigars[i], null, 30);
        }

        try (final SAMFileWriter writer = new SAMFileWriterFactory()
                .setCompressionLevel(2)
                .makeBAMWriter(builder.getHeader(), false, temp)) {
            builder.forEach(writer::addAlignment);
        }
    }

    final File referenceFile = CHR_M_REFERENCE;
    final File vcf = new File(TEST_DIR, "NIST.selected.vcf");

    final File outputBaseFileName = new File(OUTPUT_DATA_PATH, "test");
    final File errorByAll = new File(outputBaseFileName.getAbsolutePath() + errorSubscript);
    errorByAll.deleteOnExit();
    outputBaseFileName.deleteOnExit();

    final String[] args = {
            "INPUT=" + temp,
            "OUTPUT=" + outputBaseFileName,
            "REFERENCE_SEQUENCE=" + referenceFile.getAbsolutePath(),
            "ERROR_METRICS=" + "INDEL_ERROR",
            "ERROR_METRICS=" + "INDEL_ERROR:INDEL_LENGTH",
            "VCF=" + vcf.getAbsolutePath()
    };

    CollectSamErrorMetrics collectSamErrorMetrics = new CollectSamErrorMetrics();
    collectSamErrorMetrics.ERROR_METRICS.clear();

    Assert.assertEquals(collectSamErrorMetrics.instanceMain(args), 0);

    ErrorMetric.setPriorError(QualityUtil.getErrorProbabilityFromPhredScore(priorQ));
    expectedMetric.calculateDerivedFields();

    // Note that soft clipped bases are not counted
    List<IndelErrorMetric> metrics = MetricsFile.readBeans(errorByAll);

    IndelErrorMetric metric = metrics
            .stream()
            .filter(m -> m.COVARIATE.equals(expectedMetric.COVARIATE))
            .findAny()
            .orElseThrow(() -> new AssertionError("didn't find metric with COVARIATE==" + expectedMetric.COVARIATE));

    Assert.assertEquals(metric, expectedMetric);
}
 
Example #15
Source File: CollectWgsMetrics.java    From picard with MIT License 4 votes vote down vote up
private <T extends AbstractRecordAndOffset> WgsMetricsProcessorImpl<T> getWgsMetricsProcessor(
        ProgressLogger progress, ReferenceSequenceFileWalker refWalker,
        AbstractLocusIterator<T, AbstractLocusInfo<T>> iterator, AbstractWgsMetricsCollector<T> collector) {
    return new WgsMetricsProcessorImpl<>(iterator, refWalker, collector, progress);
}
 
Example #16
Source File: CollectWgsMetrics.java    From picard with MIT License 4 votes vote down vote up
@Override
protected int doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);
    IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
    INTERVALS = intervalArugmentCollection.getIntervalFile();
    if (INTERVALS != null) {
        IOUtil.assertFileIsReadable(INTERVALS);
    }
    if (THEORETICAL_SENSITIVITY_OUTPUT != null) {
        IOUtil.assertFileIsWritable(THEORETICAL_SENSITIVITY_OUTPUT);
    }

    // it doesn't make sense for the locus accumulation cap to be lower than the coverage cap
    if (LOCUS_ACCUMULATION_CAP < COVERAGE_CAP) {
        log.warn("Setting the LOCUS_ACCUMULATION_CAP to be equal to the COVERAGE_CAP (" + COVERAGE_CAP + ") because it should not be lower");
        LOCUS_ACCUMULATION_CAP = COVERAGE_CAP;
    }

    // Setup all the inputs
    final ProgressLogger progress = new ProgressLogger(log, 10000000, "Processed", "loci");
    final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
    final SamReader in = getSamReader();
    final AbstractLocusIterator iterator = getLocusIterator(in);

    final List<SamRecordFilter> filters = new ArrayList<>();
    final CountingFilter adapterFilter = new CountingAdapterFilter();
    final CountingFilter mapqFilter = new CountingMapQFilter(MINIMUM_MAPPING_QUALITY);
    final CountingFilter dupeFilter = new CountingDuplicateFilter();
    final CountingPairedFilter pairFilter = new CountingPairedFilter();
    // The order in which filters are added matters!
    filters.add(new SecondaryAlignmentFilter()); // Not a counting filter because we never want to count reads twice
    filters.add(adapterFilter);
    filters.add(mapqFilter);
    filters.add(dupeFilter);
    if (!COUNT_UNPAIRED) {
        filters.add(pairFilter);
    }
    iterator.setSamFilters(filters);
    iterator.setMappingQualityScoreCutoff(0); // Handled separately because we want to count bases
    iterator.setIncludeNonPfReads(false);

    final AbstractWgsMetricsCollector<?> collector = getCollector(COVERAGE_CAP, getIntervalsToExamine());
    final WgsMetricsProcessor processor = getWgsMetricsProcessor(progress, refWalker, iterator, collector);
    processor.processFile();

    final MetricsFile<WgsMetrics, Integer> out = getMetricsFile();
    processor.addToMetricsFile(out, INCLUDE_BQ_HISTOGRAM, dupeFilter, adapterFilter, mapqFilter, pairFilter);
    out.write(OUTPUT);

    if (THEORETICAL_SENSITIVITY_OUTPUT != null) {
        // Write out theoretical sensitivity results.
        final MetricsFile<TheoreticalSensitivityMetrics, ?> theoreticalSensitivityMetrics = getMetricsFile();
        log.info("Calculating theoretical sentitivity at " + ALLELE_FRACTION.size() + " allele fractions.");
        List<TheoreticalSensitivityMetrics> tsm = TheoreticalSensitivity.calculateSensitivities(SAMPLE_SIZE, collector.getUnfilteredDepthHistogram(), collector.getUnfilteredBaseQHistogram(), ALLELE_FRACTION);
        theoreticalSensitivityMetrics.addAllMetrics(tsm);
        theoreticalSensitivityMetrics.write(THEORETICAL_SENSITIVITY_OUTPUT);
    }

    return 0;
}
 
Example #17
Source File: CollectRrbsMetrics.java    From picard with MIT License 4 votes vote down vote up
@Override
protected int doWork() {
    if (!METRICS_FILE_PREFIX.endsWith(".")) {
        METRICS_FILE_PREFIX = METRICS_FILE_PREFIX + ".";
    }
    final File SUMMARY_OUT = new File(METRICS_FILE_PREFIX + SUMMARY_FILE_EXTENSION);
    final File DETAILS_OUT = new File(METRICS_FILE_PREFIX + DETAIL_FILE_EXTENSION);
    final File PLOTS_OUT = new File(METRICS_FILE_PREFIX + PDF_FILE_EXTENSION);
    assertIoFiles(SUMMARY_OUT, DETAILS_OUT, PLOTS_OUT);

    final SamReader samReader = SamReaderFactory.makeDefault().open(INPUT);
    if (!ASSUME_SORTED && samReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
        throw new PicardException("The input file " + INPUT.getAbsolutePath() + " does not appear to be coordinate sorted");
    }

    final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
    final ProgressLogger progressLogger = new ProgressLogger(log);

    final RrbsMetricsCollector metricsCollector = new RrbsMetricsCollector(METRIC_ACCUMULATION_LEVEL, samReader.getFileHeader().getReadGroups(),
            C_QUALITY_THRESHOLD, NEXT_BASE_QUALITY_THRESHOLD, MINIMUM_READ_LENGTH, MAX_MISMATCH_RATE);

    for (final SAMRecord samRecord : samReader) {
        progressLogger.record(samRecord);
        if (!samRecord.getReadUnmappedFlag() && !isSequenceFiltered(samRecord.getReferenceName())) {
            final ReferenceSequence referenceSequence = refWalker.get(samRecord.getReferenceIndex());
            metricsCollector.acceptRecord(samRecord, referenceSequence);
        }
    }
    metricsCollector.finish();
    final MetricsFile<RrbsMetrics, Comparable<?>> rrbsMetrics = getMetricsFile();
    metricsCollector.addAllLevelsToFile(rrbsMetrics);

    // Using RrbsMetrics as a way to get both of the metrics objects through the MultiLevelCollector. Once
    // we get it out split it apart to the two separate MetricsFiles and write them to file
    final MetricsFile<RrbsSummaryMetrics, ?> summaryFile = getMetricsFile();
    final MetricsFile<RrbsCpgDetailMetrics, ?> detailsFile = getMetricsFile();
    for (final RrbsMetrics rrbsMetric : rrbsMetrics.getMetrics()) {
        summaryFile.addMetric(rrbsMetric.getSummaryMetrics());
        for (final RrbsCpgDetailMetrics detailMetric : rrbsMetric.getDetailMetrics()) {
            detailsFile.addMetric(detailMetric);
        }
    }
    summaryFile.write(SUMMARY_OUT);
    detailsFile.write(DETAILS_OUT);
    RExecutor.executeFromClasspath(R_SCRIPT, DETAILS_OUT.getAbsolutePath(), SUMMARY_OUT.getAbsolutePath(), PLOTS_OUT.getAbsolutePath());
    CloserUtil.close(samReader);
    return 0;
}
 
Example #18
Source File: CollectSamErrorMetrics.java    From picard with MIT License 4 votes vote down vote up
/**
 * Creates the {@link SamLocusAndReferenceIterator} object to use to traverse the input files.
 * Also initializes the {@link #sequenceDictionary} and performs some checks on the output
 * files in {@link #aggregatorList} to make sure we can write our output.
 * @param sam an open {@link SamReader} from which to pull reads.
 * @param referenceSequenceFileWalker an open {@link ReferenceSequenceFileWalker} from which to get reference sequence information.
 * @return An open {@link SamLocusAndReferenceIterator} ready to iterate.
 */
private SamLocusAndReferenceIterator createSamLocusAndReferenceIterator(final SamReader sam, final ReferenceSequenceFileWalker referenceSequenceFileWalker) {

    sequenceDictionary = referenceSequenceFileWalker.getSequenceDictionary();
    if (sam.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
        throw new PicardException("Input BAM must be sorted by coordinate");
    }

    // Make sure our reference and reads have the same sequence dictionary:
    sequenceDictionary.assertSameDictionary(sam.getFileHeader().getSequenceDictionary());

    final IntervalList regionOfInterest = getIntervals(sequenceDictionary);

    log.info("Getting SamLocusIterator");

    final SamLocusIterator samLocusIterator = new SamLocusIterator(sam, regionOfInterest);

    // We want to know about indels:
    samLocusIterator.setIncludeIndels(true);

    // Now go through and compare information at each of these sites
    samLocusIterator.setEmitUncoveredLoci(false);
    samLocusIterator.setMappingQualityScoreCutoff(MIN_MAPPING_Q);
    samLocusIterator.setQualityScoreCutoff(MIN_BASE_Q);

    log.info("Using " + aggregatorList.size() + " aggregators.");

    aggregatorList.forEach(la ->
            IOUtil.assertFileIsWritable(new File(OUTPUT + la.getSuffix())));

    // iterate over loci
    log.info("Starting iteration over loci");

    final SamLocusAndReferenceIterator iterator = new SamLocusAndReferenceIterator(referenceSequenceFileWalker, samLocusIterator);

    // This hasNext() call has side-effects. It loads up the index and makes sure that
    // the iterator is really ready for
    // action. Calling this allows for the logging to be more accurate.
    iterator.hasNext();
    return iterator;
}
 
Example #19
Source File: AbstractAlignmentMerger.java    From picard with MIT License 4 votes vote down vote up
protected void resetRefSeqFileWalker() {
    this.refSeq = new ReferenceSequenceFileWalker(referenceFasta);
}
 
Example #20
Source File: AbstractAlignmentMerger.java    From picard with MIT License 4 votes vote down vote up
/**
 * Constructor
 *
 * @param unmappedBamFile                   The BAM file that was used as the input to the aligner, which will
 *                                          include info on all the reads that did not map.  Required.
 * @param targetBamFile                     The file to which to write the merged SAM records. Required.
 * @param referenceFasta                    The reference sequence for the map files. Required.
 * @param clipAdapters                      Whether adapters marked in unmapped BAM file should be marked as
 *                                          soft clipped in the merged bam. Required.
 * @param bisulfiteSequence                 Whether the reads are bisulfite sequence (used when calculating the
 *                                          NM and UQ tags). Required.
 * @param alignedReadsOnly                  Whether to output only those reads that have alignment data
 * @param programRecord                     Program record for target file SAMRecords created.
 * @param attributesToRetain                private attributes from the alignment record that should be
 *                                          included when merging.  This overrides the exclusion of
 *                                          attributes whose tags start with the reserved characters
 *                                          of X, Y, and Z
 * @param attributesToRemove                attributes from the alignment record that should be
 *                                          removed when merging.  This overrides attributesToRetain if they share
 *                                          common tags.
 * @param read1BasesTrimmed                 The number of bases trimmed from start of read 1 prior to alignment.  Optional.
 * @param read2BasesTrimmed                 The number of bases trimmed from start of read 2 prior to alignment.  Optional.
 * @param expectedOrientations              A List of SamPairUtil.PairOrientations that are expected for
 *                                          aligned pairs.  Used to determine the properPair flag.
 * @param sortOrder                         The order in which the merged records should be output.  If null,
 *                                          output will be coordinate-sorted
 * @param primaryAlignmentSelectionStrategy What to do when there are multiple primary alignments, or multiple
 *                                          alignments but none primary, for a read or read pair.
 * @param addMateCigar                      True if we are to add or maintain the mate CIGAR (MC) tag, false if we are to remove or not include.
 * @param unmapContaminantReads             If true, identify reads having the signature of cross-species contamination (i.e. mostly clipped bases),
 *                                          and mark them as unmapped.
 * @param unmappingReadsStrategy            An enum describing how to deal with reads whose mapping information are being removed (currently this happens due to cross-species
 *                                          contamination). Ignored unless unmapContaminantReads is true.
 */
public AbstractAlignmentMerger(final File unmappedBamFile, final File targetBamFile,
                               final File referenceFasta, final boolean clipAdapters,
                               final boolean bisulfiteSequence, final boolean alignedReadsOnly,
                               final SAMProgramRecord programRecord, final List<String> attributesToRetain,
                               final List<String> attributesToRemove,
                               final Integer read1BasesTrimmed, final Integer read2BasesTrimmed,
                               final List<SamPairUtil.PairOrientation> expectedOrientations,
                               final SortOrder sortOrder,
                               final PrimaryAlignmentSelectionStrategy primaryAlignmentSelectionStrategy,
                               final boolean addMateCigar,
                               final boolean unmapContaminantReads,
                               final UnmappingReadStrategy unmappingReadsStrategy) {
    IOUtil.assertFileIsReadable(unmappedBamFile);
    IOUtil.assertFileIsWritable(targetBamFile);
    IOUtil.assertFileIsReadable(referenceFasta);

    this.unmappedBamFile = unmappedBamFile;
    this.targetBamFile = targetBamFile;
    this.referenceFasta = referenceFasta;

    this.refSeq = new ReferenceSequenceFileWalker(referenceFasta);

    this.clipAdapters = clipAdapters;
    this.bisulfiteSequence = bisulfiteSequence;
    this.alignedReadsOnly = alignedReadsOnly;

    this.header = new SAMFileHeader();
    this.sortOrder = sortOrder != null ? sortOrder : SortOrder.coordinate;
    header.setSortOrder(SortOrder.coordinate);
    if (programRecord != null) {
        setProgramRecord(programRecord);
    }

    if (attributesToRetain != null) {
        this.attributesToRetain.addAll(attributesToRetain);
    }
    if (attributesToRemove != null) {
        this.attributesToRemove.addAll(attributesToRemove);
        // attributesToRemove overrides attributesToRetain
        if (!this.attributesToRetain.isEmpty()) {
            this.attributesToRemove.stream()
                    .filter(this.attributesToRetain::contains)
                    .peek(a->log.info("Overriding retaining the " + a + " tag since 'remove' overrides 'retain'."))
                    .forEach(this.attributesToRetain::remove);
        }
    }
    this.read1BasesTrimmed = read1BasesTrimmed;
    this.read2BasesTrimmed = read2BasesTrimmed;
    this.expectedOrientations = expectedOrientations;

    this.primaryAlignmentSelectionStrategy = primaryAlignmentSelectionStrategy;

    this.addMateCigar = addMateCigar;
    this.unmapContaminantReads = unmapContaminantReads;
    this.unmappingReadsStrategy = unmappingReadsStrategy;
}
 
Example #21
Source File: HetPulldownCalculator.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
/**
 * For a normal or tumor sample, returns a data structure giving (intervals, reference counts, alternate counts),
 * where intervals give positions of likely heterozygous SNP sites.
 *
 * <p>
 *     For a normal sample:
 *     <ul>
 *         The IntervalList snpIntervals gives common SNP sites in 1-based format.
 *     </ul>
 *     <ul>
 *         The p-value threshold must be specified for a two-sided binomial test,
 *         which is used to determine SNP sites from snpIntervals that are
 *         compatible with a heterozygous SNP, given the sample.  Only these sites are output.
 *     </ul>
 * </p>
 * <p>
 *     For a tumor sample:
 *     <ul>
 *         The IntervalList snpIntervals gives heterozygous SNP sites likely to be present in the normal sample.
 *         This should be from {@link HetPulldownCalculator#getNormal} in 1-based format.
 *         Only these sites are output.
 *     </ul>
 * </p>
 * @param bamFile           sorted BAM file for sample
 * @param snpIntervals      IntervalList of SNP sites
 * @param sampleType        flag indicating type of sample (SampleType.NORMAL or SampleType.TUMOR)
 *                          (determines whether to perform binomial test)
 * @param pvalThreshold     p-value threshold for two-sided binomial test, used for normal sample
 * @param minimumRawReads   minimum number of total reads that must be present at a het site
 * @return                  Pulldown of heterozygous SNP sites in 1-based format
 */
private Pulldown getHetPulldown(final File bamFile, final IntervalList snpIntervals, final SampleType sampleType,
                                final double pvalThreshold, final int minimumRawReads) {
    try (final SamReader bamReader = SamReaderFactory.makeDefault().validationStringency(validationStringency)
            .referenceSequence(refFile).open(bamFile);
         final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(refFile)) {
        if (bamReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
            throw new UserException.BadInput("BAM file " + bamFile.toString() + " must be coordinate sorted.");
        }

        final Pulldown hetPulldown = new Pulldown(bamReader.getFileHeader());

        final int totalNumberOfSNPs = snpIntervals.size();
        final SamLocusIterator locusIterator = new SamLocusIterator(bamReader, snpIntervals,
                totalNumberOfSNPs < MAX_INTERVALS_FOR_INDEX);

        //set read and locus filters [note: read counts match IGV, but off by a few from pysam.mpileup]
        final List<SamRecordFilter> samFilters = Arrays.asList(new NotPrimaryAlignmentFilter(),
                new DuplicateReadFilter());
        locusIterator.setSamFilters(samFilters);
        locusIterator.setEmitUncoveredLoci(false);
        locusIterator.setIncludeNonPfReads(false);
        locusIterator.setMappingQualityScoreCutoff(minMappingQuality);
        locusIterator.setQualityScoreCutoff(minBaseQuality);

        logger.info("Examining " + totalNumberOfSNPs + " sites in total...");
        int locusCount = 0;
        for (final SamLocusIterator.LocusInfo locus : locusIterator) {
            if (locusCount % NUMBER_OF_SITES_PER_LOGGED_STATUS_UPDATE == 0) {
                logger.info("Examined " + locusCount + " covered sites.");
            }
            locusCount++;

            //include N, etc. reads here
            final int totalReadCount = locus.getRecordAndOffsets().size();
            if (totalReadCount < minimumRawReads) {
                continue;
            }

            final Nucleotide.Counter baseCounts = getPileupBaseCounts(locus);
            //only include total ACGT counts in binomial test (exclude N, etc.)
            final int totalBaseCount = Arrays.stream(BASES).mapToInt(b -> (int) baseCounts.get(b)).sum();

            if (sampleType == SampleType.NORMAL &&
                    !isPileupHetCompatible(baseCounts, totalBaseCount, pvalThreshold)) {
                continue;
            }

            final Nucleotide refBase = Nucleotide.valueOf(refWalker.get(locus.getSequenceIndex()).getBases()[locus.getPosition() - 1]);
            final int refReadCount = (int) baseCounts.get(refBase);
            final int altReadCount = totalBaseCount - refReadCount;

            hetPulldown.add(new AllelicCount(
                    new SimpleInterval(locus.getSequenceName(), locus.getPosition(), locus.getPosition()),
                    refReadCount, altReadCount));
        }
        logger.info(locusCount + " covered sites out of " + totalNumberOfSNPs + " total sites were examined.");
        return hetPulldown;
    } catch (final IOException | SAMFormatException e) {
        throw new UserException(e.getMessage());
    }
}
 
Example #22
Source File: BayesianHetPulldownCalculator.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
/**
 * For a given normal or tumor BAM file, walks through the list of common SNPs,
 * {@link BayesianHetPulldownCalculator#snpIntervals}), detects heterozygous sites, and returns
 * a {@link Pulldown} containing detailed information on the called heterozygous SNP sites.
 *
 * The {@code hetCallingStrigency} parameters sets the threshold posterior for calling a Het SNP site:
 *
 *      hetPosteriorThreshold = 1 - 10^{-hetCallingStringency}
 *      hetThresholdLogOdds = log(hetPosteriorThreshold/(1-hetPosteriorThreshold))
 *                          = log(10^{hetCallingStringency} - 1)
 *
 * (see CNV-methods.pdf for details)
 *
 * @param bamFile sorted BAM file for sample
 * @param hetCallingStringency strigency for calling a Het site
 * @return Pulldown of heterozygous SNP sites in 1-based format
 */
public Pulldown getHetPulldown(final File bamFile, final double hetCallingStringency) {
    /* log odds from stringency */
    final double hetThresholdLogOdds = FastMath.log(FastMath.pow(10, hetCallingStringency) - 1);

    try (final SamReader bamReader = SamReaderFactory.makeDefault().validationStringency(validationStringency)
            .referenceSequence(refFile).open(bamFile);
         final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(refFile)) {
        if (bamReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
            throw new UserException.BadInput("BAM file " + bamFile.toString() + " must be coordinate sorted.");
        }

        final Pulldown hetPulldown = new Pulldown(bamReader.getFileHeader());
        final SamLocusIterator locusIterator = getSamLocusIteratorWithDefaultFilters(bamReader);

        final int totalNumberOfSNPs = snpIntervals.size();
        logger.info("Examining " + totalNumberOfSNPs + " sites in total...");
        int locusCount = 0;
        for (final SamLocusIterator.LocusInfo locus : locusIterator) {
            if (locusCount % NUMBER_OF_SITES_PER_LOGGED_STATUS_UPDATE == 0) {
                logger.info("Examined " + locusCount + " covered sites.");
            }
            locusCount++;

            final int totalReadCount = locus.getRecordAndOffsets().size();
            if (totalReadCount <= readDepthThreshold) {
                continue;
            }
            final Nucleotide refBase = Nucleotide.valueOf(refWalker.get(locus.getSequenceIndex())
                    .getBases()[locus.getPosition() - 1]);
            if (!isProperBase(refBase)) {
                logger.warn(String.format("The reference position at %d has an unknown base call (value: %s). Even though" +
                        " this position is indicated to be a possible heterozygous SNP in the provided SNP interval list," +
                        " no inference can be made. Continuing ...", locus.getPosition(), refBase.toString()));
                continue;
            }

            final Map<Nucleotide, List<BaseQuality>> baseQualities = getPileupBaseQualities(locus);
            final Nucleotide altBase = inferAltFromPileup(baseQualities, refBase);

            /* calculate Het log odds */
            final double hetLogLikelihood = getHetLogLikelihood(baseQualities, refBase, altBase);
            final double homLogLikelihood = getHomLogLikelihood(baseQualities, refBase, altBase,
                    DEFAULT_PRIOR_REF_HOM);
            final double hetLogOdds = (hetLogLikelihood + FastMath.log(DEFAULT_PRIOR_HET)) -
                    (homLogLikelihood + FastMath.log(1 - DEFAULT_PRIOR_HET));

            if (hetLogOdds > hetThresholdLogOdds) {
                hetPulldown.add(new AllelicCount(
                        new SimpleInterval(locus.getSequenceName(), locus.getPosition(), locus.getPosition()),
                        baseQualities.get(refBase).size(), baseQualities.get(altBase).size(),
                        refBase, altBase, totalReadCount, hetLogOdds));
            }
        }

        logger.info(locusCount + " covered sites out of " + totalNumberOfSNPs + " total sites were examined.");

        return hetPulldown;

    } catch (final IOException | SAMFormatException e) {
        throw new UserException(e.getMessage());
    }
}
 
Example #23
Source File: AbstractAlignmentMerger.java    From picard with MIT License 3 votes vote down vote up
/** Calculates and sets UQ tag from the record and the reference
 *
 * @param record the record to be fixed
 * @param refSeqWalker a ReferenceSequenceWalker that will be used to traverse the reference
 * @param isBisulfiteSequence a flag indicating whether the sequence came from bisulfite-sequencing.
 *
 * No return value, modifies the provided record.
 */
public static void fixUq(final SAMRecord record, final ReferenceSequenceFileWalker refSeqWalker, final boolean isBisulfiteSequence) {
    if (record.getBaseQualities() != SAMRecord.NULL_QUALS) {
        final byte[] referenceBases = refSeqWalker.get(record.getReferenceIndex()).getBases();
        record.setAttribute(SAMTag.UQ.name(), SequenceUtil.sumQualitiesOfMismatches(record, referenceBases, 0, isBisulfiteSequence));
    }
}