Java Code Examples for htsjdk.samtools.SAMFileHeader#setSortOrder()

The following examples show how to use htsjdk.samtools.SAMFileHeader#setSortOrder() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: GetSortedBAMHeader.java    From Hadoop-BAM with MIT License 6 votes vote down vote up
public static void main(String[] args) throws IOException {
	if (args.length < 2) {
		System.err.println(
			"Usage: GetSortedBAMHeader input output\n\n"+

			"Reads the BAM header from input (a standard BGZF-compressed BAM "+
			"file), and\nwrites it (BGZF-compressed, no terminator block) to "+
			"output. Sets the sort order\nindicated in the SAM header to "+
			"'coordinate'.");
		System.exit(1);
	}

	final SAMFileHeader h =
			SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT)
					.setUseAsyncIo(false)
					.open(new File(args[0])).getFileHeader();
	h.setSortOrder(SAMFileHeader.SortOrder.coordinate);

       try (FileOutputStream stream = new FileOutputStream(args[1])) {
           new SAMOutputPreparer().prepareForRecords(stream, SAMFormat.BAM, h);
       }
}
 
Example 2
Source File: GenomeLocParserUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
@Test
public void testContigHasColon() {
    SAMFileHeader header = new SAMFileHeader();
    header.setSortOrder(htsjdk.samtools.SAMFileHeader.SortOrder.coordinate);
    SAMSequenceDictionary dict = new SAMSequenceDictionary();
    SAMSequenceRecord rec = new SAMSequenceRecord("c:h:r1", 10);
    rec.setSequenceLength(10);
    dict.addSequence(rec);
    header.setSequenceDictionary(dict);

    final GenomeLocParser myGenomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
    GenomeLoc loc = myGenomeLocParser.parseGenomeLoc("c:h:r1:4-5");
    assertEquals(0, loc.getContigIndex());
    assertEquals(loc.getStart(), 4);
    assertEquals(loc.getStop(), 5);
}
 
Example 3
Source File: SdfStatistics.java    From rtg-tools with BSD 2-Clause "Simplified" License 6 votes vote down vote up
static void printSAMHeader(SequencesReader reader, final Appendable out, boolean specified) throws IOException {
  final ReferenceGenome rg = new ReferenceGenome(reader, ReferenceGenome.SEX_ALL, ReferenceGenome.ReferencePloidy.AUTO);
  final SAMFileHeader header = new SAMFileHeader();
  header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
  SamUtils.addProgramRecord(header);
  final int[] lengths = reader.sequenceLengths(0, reader.numberSequences());
  for (int i = 0; i < lengths.length; ++i) {
    final String name = reader.hasNames() ? reader.name(i) : ("sequence_" + i);
    final ReferenceSequence s = specified ? rg.sequence(name) : null;
    if (s == null || s.isSpecified()) {
      final SAMSequenceRecord record = new SAMSequenceRecord(name, lengths[i]);
      header.addSequence(record);
    }
  }
  if (reader.getSdfId().available()) {
    header.addComment(SamUtils.TEMPLATE_SDF_ATTRIBUTE + reader.getSdfId());
  }
  final ByteArrayOutputStream bo = new ByteArrayOutputStream();
  new SAMFileWriterFactory().makeSAMWriter(header, true, bo).close();
  out.append(bo.toString());
}
 
Example 4
Source File: HaplotypeMapTest.java    From picard with MIT License 6 votes vote down vote up
@DataProvider(name="haplotypeMapForWriting")
public Object[][] haplotypeMapForWriting() {

    SAMFileHeader header = new SAMFileHeader();
    header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
    SAMSequenceDictionary sd = new SAMSequenceDictionary();
    sd.addSequence(new SAMSequenceRecord("chr1", 101));
    sd.addSequence(new SAMSequenceRecord("chr2", 101));
    sd.addSequence(new SAMSequenceRecord("chr3", 101));
    header.setSequenceDictionary(sd);


    HaplotypeMap newMap = new HaplotypeMap(header);
    HaplotypeBlock t1 = new HaplotypeBlock(0.151560926);
    t1.addSnp(new Snp("snp2", "chr1", 83, (byte)'A', (byte)'G', .16, null));
    t1.addSnp(new Snp("snp1", "chr1", 51, (byte)'T', (byte)'C', 0.15, Collections.singletonList("SQNM_1CHIP_FingerprintAssays")));
    newMap.addHaplotype(t1);
    HaplotypeBlock t2 = new HaplotypeBlock(.02d);
    t2.addSnp(new Snp("snp3", "chr2", 24, (byte)'C', (byte)'A', .20, null));
    newMap.addHaplotype(t2);

    return new Object[][]{{newMap}};
}
 
Example 5
Source File: GatherGeneGCLength.java    From Drop-seq with MIT License 6 votes vote down vote up
/**
 * For a GC record and a fasta sequence, calculate the GC content.
 * Builds intervals of the unique sequences overlapped by exons, calculates the GC content for each, and aggregates results.
 * @param fastaRef
 * @return
 */
private GCResult calculateGCContentUnionExons(final Gene gene, final ReferenceSequence fastaRef, final SAMSequenceDictionary dict) {
	// make an interval list.
	SAMFileHeader h = new SAMFileHeader();
	h.setSequenceDictionary(dict);
	h.setSortOrder(SAMFileHeader.SortOrder.unsorted);
	IntervalList intervalList  = new IntervalList(h);

	for (Transcript t : gene)
		for (Exon e: t.exons)
			intervalList.add(new Interval (gene.getContig(), e.start, e.end, gene.isNegativeStrand(), gene.getName()));

	List<Interval> uniqueIntervals = IntervalList.getUniqueIntervals(intervalList, false);

	// track aggregated GC.
	GCResult result = new GCResult(0, 0, 0);

	for (Interval i: uniqueIntervals) {
		GCResult gcResultInterval = calculateGCContentExon(i, fastaRef);
		result.increment(gcResultInterval);
	}
	return result;
}
 
Example 6
Source File: HaplotypeBAMDestination.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Create a new HaplotypeBAMDestination
 *
 * @param sourceHeader SAMFileHeader used to seed the output SAMFileHeader for this destination.
 * @param haplotypeReadGroupID read group ID used when writing haplotypes as reads
 */
protected HaplotypeBAMDestination(SAMFileHeader sourceHeader, final String haplotypeReadGroupID) {
    Utils.nonNull(sourceHeader, "sourceHeader cannot be null");
    Utils.nonNull(haplotypeReadGroupID, "haplotypeReadGroupID cannot be null");
    this.haplotypeReadGroupID = haplotypeReadGroupID;

    bamOutputHeader = new SAMFileHeader();
    bamOutputHeader.setSequenceDictionary(sourceHeader.getSequenceDictionary());
    bamOutputHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);

    final List<SAMReadGroupRecord> readGroups = new ArrayList<>();
    readGroups.addAll(sourceHeader.getReadGroups()); // include the original read groups

    // plus an artificial read group for the haplotypes
    final SAMReadGroupRecord rgRec = new SAMReadGroupRecord(getHaplotypeReadGroupID());
    rgRec.setSample(haplotypeSampleTag);
    rgRec.setSequencingCenter("BI");
    readGroups.add(rgRec);
    bamOutputHeader.setReadGroups(readGroups);
    final List<SAMProgramRecord> programRecords = new ArrayList<>(sourceHeader.getProgramRecords());
    programRecords.add(new SAMProgramRecord("HaplotypeBAMWriter"));
    bamOutputHeader.setProgramRecords(programRecords);
}
 
Example 7
Source File: FastqToSam.java    From picard with MIT License 6 votes vote down vote up
/** Creates a simple header with the values provided on the command line. */
public SAMFileHeader createSamFileHeader() {
    final SAMReadGroupRecord rgroup = new SAMReadGroupRecord(this.READ_GROUP_NAME);
    rgroup.setSample(this.SAMPLE_NAME);
    if (this.LIBRARY_NAME != null) rgroup.setLibrary(this.LIBRARY_NAME);
    if (this.PLATFORM != null) rgroup.setPlatform(this.PLATFORM);
    if (this.PLATFORM_UNIT != null) rgroup.setPlatformUnit(this.PLATFORM_UNIT);
    if (this.SEQUENCING_CENTER != null) rgroup.setSequencingCenter(SEQUENCING_CENTER);
    if (this.PREDICTED_INSERT_SIZE != null) rgroup.setPredictedMedianInsertSize(PREDICTED_INSERT_SIZE);
    if (this.DESCRIPTION != null) rgroup.setDescription(this.DESCRIPTION);
    if (this.RUN_DATE != null) rgroup.setRunDate(this.RUN_DATE);
    if (this.PLATFORM_MODEL != null) rgroup.setPlatformModel(this.PLATFORM_MODEL);
    if (this.PROGRAM_GROUP != null) rgroup.setProgramGroup(this.PROGRAM_GROUP);

    final SAMFileHeader header = new SAMFileHeader();
    header.addReadGroup(rgroup);

    for (final String comment : COMMENT) {
        header.addComment(comment);
    }

    header.setSortOrder(this.SORT_ORDER);
    return header ;
}
 
Example 8
Source File: PathSeqFilterSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Override
protected void runTool(final JavaSparkContext ctx) {

    filterArgs.doReadFilterArgumentWarnings(getCommandLineParser().getPluginDescriptor(GATKReadFilterPluginDescriptor.class), logger);
    final SAMFileHeader header = PSUtils.checkAndClearHeaderSequences(getHeaderForReads(), filterArgs, logger);

    final JavaRDD<GATKRead> reads = getReads();
    final PSFilter filter = new PSFilter(ctx, filterArgs, header);
    final Tuple2<JavaRDD<GATKRead>, JavaRDD<GATKRead>> filterResult;
    try (final PSFilterLogger filterLogger = filterArgs.filterMetricsFileUri != null ? new PSFilterFileLogger(getMetricsFile(), filterArgs.filterMetricsFileUri) : new PSFilterEmptyLogger()) {
        filterResult = filter.doFilter(reads, filterLogger);
    }
    final JavaRDD<GATKRead> pairedReads = filterResult._1;
    final JavaRDD<GATKRead> unpairedReads = filterResult._2;

    if (!pairedReads.isEmpty()) {
        header.setSortOrder(SAMFileHeader.SortOrder.queryname);
        writeReads(ctx, outputPaired, pairedReads, header, true);
    } else {
        logger.info("No paired reads to write - BAM will not be written.");
    }
    if (!unpairedReads.isEmpty()) {
        header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
        writeReads(ctx, outputUnpaired, unpairedReads, header, true);
    } else {
        logger.info("No unpaired reads to write - BAM will not be written.");
    }
    filter.close();
}
 
Example 9
Source File: SparkUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Sort reads into queryname order if they are not already sorted
 */
public static JavaRDD<GATKRead> querynameSortReadsIfNecessary(JavaRDD<GATKRead> reads, int numReducers, SAMFileHeader header) {
    JavaRDD<GATKRead> sortedReadsForMarking;
    if (ReadUtils.isReadNameGroupedBam(header)) {
        sortedReadsForMarking = reads;
    } else {
        header.setSortOrder(SAMFileHeader.SortOrder.queryname);
        sortedReadsForMarking = sortReadsAccordingToHeader(reads, header, numReducers);
    }
    return sortedReadsForMarking;
}
 
Example 10
Source File: SortSamSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Override
protected void runTool(final JavaSparkContext ctx) {
    final JavaRDD<GATKRead> reads = getReads();
    final int numReducers = getRecommendedNumReducers();
    logger.info("Using {} reducers", numReducers);

    final SAMFileHeader header = getHeaderForReads();
    header.setSortOrder(sortOrder.getSamOrder());

    writeReads(ctx, outputFile, reads, header, true);
}
 
Example 11
Source File: TagReadWithInterval.java    From Drop-seq with MIT License 5 votes vote down vote up
@Override
protected int doWork() {
	IOUtil.assertFileIsReadable(INPUT);
	IOUtil.assertFileIsWritable(OUTPUT);
	SamReader inputSam = SamReaderFactory.makeDefault().open(INPUT);
	SAMFileHeader header = inputSam.getFileHeader();
       header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
	SamHeaderUtil.addPgRecord(header, this);

	SAMFileWriter writer= new SAMFileWriterFactory().makeSAMOrBAMWriter(header, true, OUTPUT);

	IntervalList loci = IntervalList.fromFile(this.INTERVALS);
	OverlapDetector<Interval> od =getOverlapDetector (loci);
	ProgressLogger processLogger = new ProgressLogger(log);

	for (SAMRecord record: inputSam) {
		processLogger.record(record);

		if (record.getReadUnmappedFlag()==false)
			// use alignment blocks instead of start/end to properly deal with split reads mapped over exon/exon boundaries.
			record=tagRead(record, od);
		else
			record.setAttribute(this.TAG, null);
		writer.addAlignment(record);

	}

	CloserUtil.close(inputSam);
	writer.close();

	return(0);


}
 
Example 12
Source File: MergeSamFiles.java    From picard with MIT License 4 votes vote down vote up
/** Combines multiple SAM/BAM files into one. */
@Override
protected int doWork() {
    boolean matchedSortOrders = true;
    
    // read interval list if it is defined
    final List<Interval> intervalList = (INTERVALS == null ? null : IntervalList.fromFile(INTERVALS).uniqued().getIntervals() );
    // map reader->iterator used if INTERVALS is defined
    final Map<SamReader, CloseableIterator<SAMRecord> > samReaderToIterator = new HashMap<>(INPUT.size());

    final List<Path> inputPaths = IOUtil.getPaths(INPUT);

    // Open the files for reading and writing
    final List<SamReader> readers = new ArrayList<>();
    final List<SAMFileHeader> headers = new ArrayList<>();
    {
        SAMSequenceDictionary dict = null; // Used to try and reduce redundant SDs in memory

        for (final Path inFile : inputPaths) {
            IOUtil.assertFileIsReadable(inFile);
            final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(inFile);
            if (INTERVALS != null) {
                if (!in.hasIndex()) {
                    throw new PicardException("Merging with interval but BAM file is not indexed: " + inFile);
                }
                final CloseableIterator<SAMRecord> samIterator = new SamRecordIntervalIteratorFactory().makeSamRecordIntervalIterator(in, intervalList, true);
                samReaderToIterator.put(in, samIterator);
            }

            readers.add(in);
            headers.add(in.getFileHeader());

            // A slightly hackish attempt to keep memory consumption down when merging multiple files with
            // large sequence dictionaries (10,000s of sequences). If the dictionaries are identical, then
            // replace the duplicate copies with a single dictionary to reduce the memory footprint. 
            if (dict == null) {
                dict = in.getFileHeader().getSequenceDictionary();
            } else if (dict.equals(in.getFileHeader().getSequenceDictionary())) {
                in.getFileHeader().setSequenceDictionary(dict);
            }

            matchedSortOrders = matchedSortOrders && in.getFileHeader().getSortOrder() == SORT_ORDER;
        }
    }

    // If all the input sort orders match the output sort order then just merge them and
    // write on the fly, otherwise setup to merge and sort before writing out the final file
    IOUtil.assertFileIsWritable(OUTPUT);
    final boolean presorted;
    final SAMFileHeader.SortOrder headerMergerSortOrder;
    final boolean mergingSamRecordIteratorAssumeSorted;

    if (matchedSortOrders || SORT_ORDER == SAMFileHeader.SortOrder.unsorted || ASSUME_SORTED || INTERVALS != null ) {
        log.info("Input files are in same order as output so sorting to temp directory is not needed.");
        headerMergerSortOrder = SORT_ORDER;
        mergingSamRecordIteratorAssumeSorted = ASSUME_SORTED;
        presorted = true;
    } else {
        log.info("Sorting input files using temp directory " + TMP_DIR);
        headerMergerSortOrder = SAMFileHeader.SortOrder.unsorted;
        mergingSamRecordIteratorAssumeSorted = false;
        presorted = false;
    }
    final SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(headerMergerSortOrder, headers, MERGE_SEQUENCE_DICTIONARIES);
    final MergingSamRecordIterator iterator;
    // no interval defined, get an iterator for the whole bam
    if (intervalList == null) {
        iterator = new MergingSamRecordIterator(headerMerger, readers, mergingSamRecordIteratorAssumeSorted);
    } else {
        // show warning related to https://github.com/broadinstitute/picard/pull/314/files
        log.info("Warning: merged bams from different interval lists may contain the same read in both files");
        iterator = new MergingSamRecordIterator(headerMerger, samReaderToIterator, true);
    }
    final SAMFileHeader header = headerMerger.getMergedHeader();
    for (final String comment : COMMENT) {
        header.addComment(comment);
    }
    header.setSortOrder(SORT_ORDER);
    final SAMFileWriterFactory samFileWriterFactory = new SAMFileWriterFactory();
    if (USE_THREADING) {
        samFileWriterFactory.setUseAsyncIo(true);
    }
    final SAMFileWriter out = samFileWriterFactory.makeSAMOrBAMWriter(header, presorted, OUTPUT);

    // Lastly loop through and write out the records
    final ProgressLogger progress = new ProgressLogger(log, PROGRESS_INTERVAL);
    while (iterator.hasNext()) {
        final SAMRecord record = iterator.next();
        out.addAlignment(record);
        progress.record(record);
    }

    log.info("Finished reading inputs.");
    for(final CloseableIterator<SAMRecord> iter : samReaderToIterator.values())  CloserUtil.close(iter);
    CloserUtil.close(readers);
    out.close();
    return 0;
}
 
Example 13
Source File: Evidence2SAM.java    From cramtools with Apache License 2.0 4 votes vote down vote up
public static void main(String[] args) throws IOException, ParseException {
	EvidenceRecordFileIterator iterator = new EvidenceRecordFileIterator(new File(args[0]));
	Read context = new Read();
	SAMFileHeader header = new SAMFileHeader();
	header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
	SAMSequenceRecord samSequenceRecord = new SAMSequenceRecord("chr10", 135534747);
	samSequenceRecord.setAttribute(SAMSequenceRecord.ASSEMBLY_TAG, iterator.assembly_ID);
	String readGroup = String.format("%s-%s", iterator.assembly_ID, iterator.chromosome);
	SAMReadGroupRecord readGroupRecord = new SAMReadGroupRecord(readGroup);
	readGroupRecord.setAttribute(SAMReadGroupRecord.READ_GROUP_SAMPLE_TAG, iterator.sample);
	readGroupRecord.setAttribute(SAMReadGroupRecord.PLATFORM_UNIT_TAG, readGroup);
	readGroupRecord.setAttribute(SAMReadGroupRecord.SEQUENCING_CENTER_TAG, "\"Complete Genomics\"");
	Date date = new SimpleDateFormat("yyyy-MMM-dd hh:mm:ss.S").parse(iterator.generatedAt);
	readGroupRecord.setAttribute(SAMReadGroupRecord.DATE_RUN_PRODUCED_TAG,
			new SimpleDateFormat("yyyy-MM-dd").format(date));
	readGroupRecord.setAttribute(SAMReadGroupRecord.PLATFORM_TAG, "\"Complete Genomics\"");
	header.addReadGroup(readGroupRecord);

	header.addSequence(samSequenceRecord);

	SAMFileWriterFactory f = new SAMFileWriterFactory();
	SAMFileWriter samWriter;
	if (args.length > 1)
		samWriter = f.makeBAMWriter(header, false, new File(args[1]));
	else
		samWriter = f.makeSAMWriter(header, false, System.out);
	int i = 0;
	long time = System.currentTimeMillis();
	DedupIterator dedupIt = new DedupIterator(iterator);

	while (dedupIt.hasNext()) {
		EvidenceRecord evidenceRecord = dedupIt.next();
		if (evidenceRecord == null)
			throw new RuntimeException();
		try {
			context.reset(evidenceRecord);
			context.parse();
		} catch (Exception e) {
			System.err.println("Failed on line:");
			System.err.println(evidenceRecord.line);
			throw new RuntimeException(e);
		}

		SAMRecord[] samRecords = context.toSAMRecord(header);
		for (SAMRecord samRecord : samRecords) {
			samRecord.setAttribute(SAMTag.RG.name(), readGroup);
			samWriter.addAlignment(samRecord);
		}

		i++;
		if (i % 1000 == 0) {
			if (System.currentTimeMillis() - time > 10 * 1000) {
				time = System.currentTimeMillis();
				System.err.println(i);
			}
		}

		if (i > 10000)
			break;
	}
	samWriter.close();
}
 
Example 14
Source File: BedToIntervalList.java    From picard with MIT License 4 votes vote down vote up
@Override
protected int doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsReadable(SEQUENCE_DICTIONARY);
    IOUtil.assertFileIsWritable(OUTPUT);
    try {
        // create a new header that we will assign the dictionary provided by the SAMSequenceDictionaryExtractor to.
        final SAMFileHeader header = new SAMFileHeader();
        final SAMSequenceDictionary samSequenceDictionary = SAMSequenceDictionaryExtractor.extractDictionary(SEQUENCE_DICTIONARY.toPath());
        header.setSequenceDictionary(samSequenceDictionary);
        // set the sort order to be sorted by coordinate, which is actually done below
        // by getting the .uniqued() intervals list before we write out the file
        header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
        final IntervalList intervalList = new IntervalList(header);

        final FeatureReader<BEDFeature> bedReader = AbstractFeatureReader.getFeatureReader(INPUT.getAbsolutePath(), new BEDCodec(), false);
        final CloseableTribbleIterator<BEDFeature> iterator = bedReader.iterator();
        final ProgressLogger progressLogger = new ProgressLogger(LOG, (int) 1e6);

        while (iterator.hasNext()) {
            final BEDFeature bedFeature = iterator.next();
            final String sequenceName = bedFeature.getContig();
            final int start = bedFeature.getStart();
            final int end = bedFeature.getEnd();
            // NB: do not use an empty name within an interval
            final String name;
            if (bedFeature.getName().isEmpty()) {
                name = null;
            } else {
                name = bedFeature.getName();
            }

            final SAMSequenceRecord sequenceRecord = header.getSequenceDictionary().getSequence(sequenceName);

            // Do some validation
            if (null == sequenceRecord) {
                if (DROP_MISSING_CONTIGS) {
                    LOG.info(String.format("Dropping interval with missing contig: %s:%d-%d", sequenceName, bedFeature.getStart(), bedFeature.getEnd()));
                    missingIntervals++;
                    missingRegion += bedFeature.getEnd() - bedFeature.getStart();
                    continue;
                }
                throw new PicardException(String.format("Sequence '%s' was not found in the sequence dictionary", sequenceName));
            } else if (start < 1) {
                throw new PicardException(String.format("Start on sequence '%s' was less than one: %d", sequenceName, start));
            } else if (sequenceRecord.getSequenceLength() < start) {
                throw new PicardException(String.format("Start on sequence '%s' was past the end: %d < %d", sequenceName, sequenceRecord.getSequenceLength(), start));
            } else if ((end == 0 && start != 1 ) //special case for 0-length interval at the start of a contig
                    || end < 0 ) {
                throw new PicardException(String.format("End on sequence '%s' was less than one: %d", sequenceName, end));
            } else if (sequenceRecord.getSequenceLength() < end) {
                throw new PicardException(String.format("End on sequence '%s' was past the end: %d < %d", sequenceName, sequenceRecord.getSequenceLength(), end));
            } else if (end < start - 1) {
                throw new PicardException(String.format("On sequence '%s', end < start-1: %d <= %d", sequenceName, end, start));
            }

            final boolean isNegativeStrand = bedFeature.getStrand() == Strand.NEGATIVE;
            final Interval interval = new Interval(sequenceName, start, end, isNegativeStrand, name);
            intervalList.add(interval);

            progressLogger.record(sequenceName, start);
        }
        CloserUtil.close(bedReader);

        if (DROP_MISSING_CONTIGS) {
            if (missingRegion == 0) {
                LOG.info("There were no missing regions.");
            } else {
                LOG.warn(String.format("There were %d missing regions with a total of %d bases", missingIntervals, missingRegion));
            }
        }
        // Sort and write the output
        IntervalList out = intervalList;
        if (SORT) {
            out = out.sorted();
        }
        if (UNIQUE) {
            out = out.uniqued();
        }
        out.write(OUTPUT);
        LOG.info(String.format("Wrote %d intervals spanning a total of %d bases", out.getIntervals().size(),out.getBaseCount()));

    } catch (final IOException e) {
        throw new RuntimeException(e);
    }

    return 0;
}
 
Example 15
Source File: MergeBamAlignmentTest.java    From picard with MIT License 4 votes vote down vote up
/**
 * Read a single paired-end read from a file, and create one or more aligned records for the read pair based on
 * the lists, merge with the original paired-end read, and assert expected results.
 * @param description
 * @param firstOfPair List that describes the aligned SAMRecords to create for the first end.
 * @param secondOfPair List that describes the aligned SAMRecords to create for the second end.
 * @param expectedPrimaryHitIndex Expected value for the HI tag in the primary alignment in the merged output.
 * @param expectedNumFirst Expected number of first-of-pair SAMRecords in the merged output.
 * @param expectedNumSecond Expected number of second-of-pair SAMRecords in the merged output.
 * @param expectedPrimaryMapq Sum of mapqs of both ends of primary alignment in the merged output.
 * @throws Exception
 */
@Test(dataProvider = "testPairedMultiHitWithFilteringTestCases")
public void testPairedMultiHitWithFiltering(final String description, final List<HitSpec> firstOfPair, final List<HitSpec> secondOfPair,
                                            final Integer expectedPrimaryHitIndex, final int expectedNumFirst,
                                            final int expectedNumSecond, final int expectedPrimaryMapq) throws Exception {

    // Create the aligned file by copying bases, quals, readname from the unmapped read, and conforming to each HitSpec.
    final File unmappedSam = new File(TEST_DATA_DIR, "multihit.filter.unmapped.sam");
    final SAMRecordIterator unmappedSamFileIterator = SamReaderFactory.makeDefault().open(unmappedSam).iterator();
    final SAMRecord firstUnmappedRec = unmappedSamFileIterator.next();
    final SAMRecord secondUnmappedRec = unmappedSamFileIterator.next();
    unmappedSamFileIterator.close();
    final File alignedSam = File.createTempFile("aligned.", ".sam");
    alignedSam.deleteOnExit();
    final SAMFileHeader alignedHeader = new SAMFileHeader();
    alignedHeader.setSortOrder(SAMFileHeader.SortOrder.queryname);
    alignedHeader.setSequenceDictionary(SamReaderFactory.makeDefault().getFileHeader(sequenceDict).getSequenceDictionary());
    final SAMFileWriter alignedWriter = new SAMFileWriterFactory().makeSAMWriter(alignedHeader, true, alignedSam);
    for (int i = 0; i < Math.max(firstOfPair.size(), secondOfPair.size()); ++i) {
        final HitSpec firstHitSpec = firstOfPair.isEmpty()? null: firstOfPair.get(i);
        final HitSpec secondHitSpec = secondOfPair.isEmpty()? null: secondOfPair.get(i);
        final SAMRecord first = makeRead(alignedHeader, firstUnmappedRec, firstHitSpec, true, i);
        final SAMRecord second = makeRead(alignedHeader, secondUnmappedRec, secondHitSpec, false, i);
        if (first != null && second != null) SamPairUtil.setMateInfo(first, second);
        if (first != null) {
            if (second == null) first.setMateUnmappedFlag(true);
            alignedWriter.addAlignment(first);
        }
        if (second != null) {
            if (first == null) second.setMateUnmappedFlag(true);
            alignedWriter.addAlignment(second);
        }
    }
    alignedWriter.close();

    // Merge aligned file with original unmapped file.
    final File mergedSam = File.createTempFile("merged.", ".sam");
    mergedSam.deleteOnExit();

    doMergeAlignment(unmappedSam, Collections.singletonList(alignedSam),
            null, null, null, null,
            false, true, false, 1,
            "0", "1.0", "align!", "myAligner",
            true, fasta, mergedSam,
            null, null, null, null, null, null);

    assertSamValid(mergedSam);

    // Tally metrics and check for agreement with expected.
    final SamReader mergedReader = SamReaderFactory.makeDefault().open(mergedSam);
    int numFirst = 0;
    int numSecond = 0;
    Integer primaryHitIndex = null;
    int primaryMapq = 0;
    for (final SAMRecord rec : mergedReader) {
        if (rec.getFirstOfPairFlag()) ++numFirst;
        if (rec.getSecondOfPairFlag()) ++numSecond;
        if (!rec.getNotPrimaryAlignmentFlag()  && !rec.getReadUnmappedFlag()) {
            final Integer hitIndex = rec.getIntegerAttribute(SAMTag.HI.name());
            final int newHitIndex = (hitIndex == null? -1: hitIndex);
            if (primaryHitIndex == null) primaryHitIndex = newHitIndex;
            else Assert.assertEquals(newHitIndex, primaryHitIndex.intValue());
            primaryMapq += rec.getMappingQuality();
        }
    }
    Assert.assertEquals(numFirst, expectedNumFirst);
    Assert.assertEquals(numSecond, expectedNumSecond);
    Assert.assertEquals(primaryHitIndex, expectedPrimaryHitIndex);
    Assert.assertEquals(primaryMapq, expectedPrimaryMapq);
}
 
Example 16
Source File: ReadsPipelineSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
@Override
protected void runTool(final JavaSparkContext ctx) {
    String referenceFileName = addReferenceFilesForSpark(ctx, referenceArguments.getReferencePath());
    List<String> localKnownSitesFilePaths = addVCFsForSpark(ctx, knownVariants);

    final JavaRDD<GATKRead> alignedReads;
    final SAMFileHeader header;
    final BwaSparkEngine bwaEngine;
    if (align) {
        bwaEngine = new BwaSparkEngine(ctx, referenceArguments.getReferenceFileName(), bwaArgs.indexImageFile, getHeaderForReads(), getReferenceSequenceDictionary());
        if (bwaArgs.singleEndAlignment) {
            alignedReads = bwaEngine.alignUnpaired(getReads());
        } else {
            // filter reads after alignment in the case of paired reads since filtering does not know about pairs
            final ReadFilter filter = makeReadFilter(bwaEngine.getHeader());
            alignedReads = bwaEngine.alignPaired(getUnfilteredReads()).filter(filter::test);
        }
        header = bwaEngine.getHeader();
    } else {
        bwaEngine = null;
        alignedReads = getReads();
        header = getHeaderForReads();
    }

    final JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.mark(alignedReads, header, new OpticalDuplicateFinder(), markDuplicatesSparkArgumentCollection, getRecommendedNumReducers());

    // always coordinate-sort reads so BQSR can use queryLookaheadBases in FeatureDataSource
    final SAMFileHeader readsHeader = header.clone();
    readsHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
    final JavaRDD<GATKRead> sortedMarkedReads = SparkUtils.sortReadsAccordingToHeader(markedReads, readsHeader, numReducers);

    // The markedReads have already had the WellformedReadFilter applied to them, which
    // is all the filtering that MarkDupes and ApplyBQSR want. BQSR itself wants additional
    // filtering performed, so we do that here.
    //NOTE: this doesn't honor enabled/disabled commandline filters
    final ReadFilter bqsrReadFilter = ReadFilter.fromList(BaseRecalibrator.getBQSRSpecificReadFilterList(), header);

    JavaRDD<GATKRead> markedFilteredReadsForBQSR = sortedMarkedReads.filter(bqsrReadFilter::test);

    JavaPairRDD<GATKRead, Iterable<GATKVariant>> readsWithVariants = JoinReadsWithVariants.join(markedFilteredReadsForBQSR, localKnownSitesFilePaths);
    final RecalibrationReport bqsrReport = BaseRecalibratorSparkFn.apply(readsWithVariants, getHeaderForReads(), referenceFileName, bqsrArgs);

    final Broadcast<RecalibrationReport> reportBroadcast = ctx.broadcast(bqsrReport);
    final JavaRDD<GATKRead> finalReads = ApplyBQSRSparkFn.apply(sortedMarkedReads, reportBroadcast, getHeaderForReads(), applyBqsrArgs.toApplyBQSRArgumentCollection(bqsrArgs));

    if (outputBam != null) { // only write output of BQSR if output BAM is specified
        writeReads(ctx, outputBam, finalReads, header, true);
    }

    // Run Haplotype Caller
    final ReadFilter hcReadFilter = ReadFilter.fromList(HaplotypeCallerEngine.makeStandardHCReadFilters(), header);
    final JavaRDD<GATKRead> filteredReadsForHC = finalReads.filter(hcReadFilter::test);
    SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();
    final List<SimpleInterval> intervals = hasUserSuppliedIntervals() ? getIntervals() : IntervalUtils.getAllIntervalsForReference(sequenceDictionary);

    List<ShardBoundary> intervalShards = intervals.stream()
            .flatMap(interval -> Shard.divideIntervalIntoShards(interval, shardingArgs.readShardSize, shardingArgs.readShardPadding, sequenceDictionary).stream())
            .collect(Collectors.toList());

    HaplotypeCallerSpark.callVariantsWithHaplotypeCallerAndWriteOutput(ctx, filteredReadsForHC, readsHeader, sequenceDictionary, referenceArguments.getReferenceFileName(), intervalShards, hcArgs, shardingArgs, assemblyRegionArgs, output, makeVariantAnnotations(), logger, strict, createOutputVariantIndex);

    if (bwaEngine != null) {
        bwaEngine.close();
    }
}
 
Example 17
Source File: MergeBamAlignmentTest.java    From picard with MIT License 4 votes vote down vote up
/**
 * Confirm that paired reads are rejected by PrimaryAlignmentStrategy.EarliestFragment.
 */
@Test(expectedExceptions = UnsupportedOperationException.class)
public void testEarliestFragmentStrategyPaired() throws Exception {
    final File output = File.createTempFile("mergeTest", ".sam");
    output.deleteOnExit();

    final File unmappedSam = File.createTempFile("unmapped.", ".sam");
    unmappedSam.deleteOnExit();
    final SAMFileWriterFactory factory = new SAMFileWriterFactory();
    final SAMFileHeader header = new SAMFileHeader();
    header.setSortOrder(SAMFileHeader.SortOrder.queryname);
    final String cigar = "16M";

    final SAMRecord firstOfPair = new SAMRecord(header);
    firstOfPair.setReadName("theRead");
    firstOfPair.setReadString("ACGTACGTACGTACGT");
    firstOfPair.setBaseQualityString("5555555555555555");
    firstOfPair.setReadUnmappedFlag(true);
    firstOfPair.setReadPairedFlag(true);
    firstOfPair.setFirstOfPairFlag(true);

    final SAMRecord secondOfPair = new SAMRecord(header);
    secondOfPair.setReadName("theRead");
    secondOfPair.setReadString("ACGTACGTACGTACGT");
    secondOfPair.setBaseQualityString("5555555555555555");
    secondOfPair.setReadUnmappedFlag(true);
    secondOfPair.setReadPairedFlag(true);
    secondOfPair.setSecondOfPairFlag(true);
    SamPairUtil.setMateInfo(firstOfPair, secondOfPair);

    final SAMFileWriter unmappedWriter = factory.makeSAMWriter(header, false, unmappedSam);
    unmappedWriter.addAlignment(firstOfPair);
    unmappedWriter.addAlignment(secondOfPair);
    unmappedWriter.close();

    final File alignedSam = File.createTempFile("aligned.", ".sam");
    alignedSam.deleteOnExit();

    // Populate the header with SAMSequenceRecords
    header.setSequenceDictionary(SAMSequenceDictionaryExtractor.extractDictionary(sequenceDict2.toPath()));

    // Create 2 alignments for each end of pair
    final SAMFileWriter alignedWriter = factory.makeSAMWriter(header, false, alignedSam);
    for (int i = 1; i <= 2; ++i) {
        final SAMRecord firstOfPairAligned = new SAMRecord(header);
        firstOfPairAligned.setReadName(firstOfPair.getReadName());
        firstOfPairAligned.setReadBases(firstOfPair.getReadBases());
        firstOfPairAligned.setBaseQualities(firstOfPair.getBaseQualities());
        firstOfPairAligned.setReferenceName("chr1");
        firstOfPairAligned.setAlignmentStart(i);
        firstOfPairAligned.setCigarString(cigar);
        firstOfPairAligned.setMappingQuality(100);
        firstOfPairAligned.setReadPairedFlag(true);
        firstOfPairAligned.setFirstOfPairFlag(true);
        firstOfPairAligned.setAttribute(SAMTag.HI.name(), i);

        final SAMRecord secondOfPairAligned = new SAMRecord(header);
        secondOfPairAligned.setReadName(secondOfPair.getReadName());
        secondOfPairAligned.setReadBases(secondOfPair.getReadBases());
        secondOfPairAligned.setBaseQualities(secondOfPair.getBaseQualities());
        secondOfPairAligned.setReferenceName("chr1");
        secondOfPairAligned.setAlignmentStart(i + 10);
        secondOfPairAligned.setCigarString(cigar);
        secondOfPairAligned.setMappingQuality(100);
        secondOfPairAligned.setReadPairedFlag(true);
        secondOfPairAligned.setSecondOfPairFlag(true);
        secondOfPairAligned.setAttribute(SAMTag.HI.name(), i);

        SamPairUtil.setMateInfo(firstOfPairAligned, secondOfPairAligned);

        alignedWriter.addAlignment(firstOfPairAligned);
        alignedWriter.addAlignment(secondOfPairAligned);
    }
    alignedWriter.close();

    doMergeAlignment(unmappedSam, Collections.singletonList(alignedSam),
            null, null, null, null,
            false, true, false, 1,
            "0", "1.0", "align!", "myAligner",
            true,
            fasta, output,
            SamPairUtil.PairOrientation.FR,
            MergeBamAlignment.PrimaryAlignmentStrategy.EarliestFragment,
            null, null, null, null);

    Assert.fail("Exception was not thrown");
}
 
Example 18
Source File: MergeBamAlignmentTest.java    From picard with MIT License 4 votes vote down vote up
/**
 * @return a 2-element array in which the first element is the unmapped SAM, and the second the mapped SAM.
 */
private File[] createSamFilesToBeMerged(final MultipleAlignmentSpec[] specs) {
    try {
        final File unmappedSam = File.createTempFile("unmapped.", ".sam");
        unmappedSam.deleteOnExit();
        final SAMFileWriterFactory factory = new SAMFileWriterFactory();
        final SAMFileHeader header = new SAMFileHeader();
        header.setSortOrder(SAMFileHeader.SortOrder.queryname);
        final SAMRecord unmappedRecord = new SAMRecord(header);

        unmappedRecord.setReadName("theRead");
        unmappedRecord.setReadString("ACGTACGTACGTACGT");
        unmappedRecord.setBaseQualityString("5555555555555555");
        unmappedRecord.setReadUnmappedFlag(true);

        final SAMFileWriter unmappedWriter = factory.makeSAMWriter(header, false, unmappedSam);
        unmappedWriter.addAlignment(unmappedRecord);
        unmappedWriter.close();

        final File alignedSam = File.createTempFile("aligned.", ".sam");
        alignedSam.deleteOnExit();

        final String sequence = "chr1";
        // Populate the header with SAMSequenceRecords
        header.setSequenceDictionary(SAMSequenceDictionaryExtractor.extractDictionary(sequenceDict2.toPath()));

        final SAMFileWriter alignedWriter = factory.makeSAMWriter(header, false, alignedSam);
        for (final MultipleAlignmentSpec spec : specs) {
            final SAMRecord alignedRecord = new SAMRecord(header);
            alignedRecord.setReadName(unmappedRecord.getReadName());
            alignedRecord.setReadBases(unmappedRecord.getReadBases());
            alignedRecord.setBaseQualities(unmappedRecord.getBaseQualities());
            alignedRecord.setReferenceName(sequence);
            alignedRecord.setAlignmentStart(1);
            alignedRecord.setReadNegativeStrandFlag(spec.reverseStrand);
            alignedRecord.setCigarString(spec.cigar);
            alignedRecord.setMappingQuality(spec.mapQ);
            if (spec.oneOfTheBest) {
                alignedRecord.setAttribute(ONE_OF_THE_BEST_TAG, 1);
            }
            alignedWriter.addAlignment(alignedRecord);
        }
        alignedWriter.close();

        return new File[]{unmappedSam, alignedSam};
    } catch (IOException e) {
        throw new PicardException(e.getMessage(), e);
    }
}
 
Example 19
Source File: MergeBamAlignmentTest.java    From picard with MIT License 4 votes vote down vote up
private void testBestFragmentMapqStrategy(final String testName, final int[] firstMapQs, final int[] secondMapQs,
                                          final boolean includeSecondary, final int expectedFirstMapq,
                                          final int expectedSecondMapq) throws Exception {
    final File unmappedSam = File.createTempFile("unmapped.", ".sam");
    unmappedSam.deleteOnExit();
    final SAMFileWriterFactory factory = new SAMFileWriterFactory();
    final SAMFileHeader header = new SAMFileHeader();
    header.setSortOrder(SAMFileHeader.SortOrder.queryname);

    final String readName = "theRead";
    final SAMRecord firstUnmappedRead = new SAMRecord(header);
    firstUnmappedRead.setReadName(readName);
    firstUnmappedRead.setReadString("ACGTACGTACGTACGT");
    firstUnmappedRead.setBaseQualityString("5555555555555555");
    firstUnmappedRead.setReadUnmappedFlag(true);
    firstUnmappedRead.setMateUnmappedFlag(true);
    firstUnmappedRead.setReadPairedFlag(true);
    firstUnmappedRead.setFirstOfPairFlag(true);

    final SAMRecord secondUnmappedRead = new SAMRecord(header);
    secondUnmappedRead.setReadName(readName);
    secondUnmappedRead.setReadString("TCGAACGTTCGAACTG");
    secondUnmappedRead.setBaseQualityString("6666666666666666");
    secondUnmappedRead.setReadUnmappedFlag(true);
    secondUnmappedRead.setMateUnmappedFlag(true);
    secondUnmappedRead.setReadPairedFlag(true);
    secondUnmappedRead.setSecondOfPairFlag(true);

    final SAMFileWriter unmappedWriter = factory.makeSAMWriter(header, false, unmappedSam);
    unmappedWriter.addAlignment(firstUnmappedRead);
    unmappedWriter.addAlignment(secondUnmappedRead);
    unmappedWriter.close();

    final File alignedSam = File.createTempFile("aligned.", ".sam");
    alignedSam.deleteOnExit();

    final String sequence = "chr1";
    // Populate the header with SAMSequenceRecords
    header.setSequenceDictionary(SAMSequenceDictionaryExtractor.extractDictionary(sequenceDict2.toPath()));

    final SAMFileWriter alignedWriter = factory.makeSAMWriter(header, false, alignedSam);

    addAlignmentsForBestFragmentMapqStrategy(alignedWriter, firstUnmappedRead, sequence, firstMapQs);
    addAlignmentsForBestFragmentMapqStrategy(alignedWriter, secondUnmappedRead, sequence, secondMapQs);
    alignedWriter.close();

    final File output = File.createTempFile("testBestFragmentMapqStrategy." + testName, ".sam");
    output.deleteOnExit();
    doMergeAlignment(unmappedSam, Collections.singletonList(alignedSam),
            null, null, null, null,
            false, true, false, 1,
            "0", "1.0", "align!", "myAligner",
            true,
            fasta, output,
            SamPairUtil.PairOrientation.FR,
            MergeBamAlignment.PrimaryAlignmentStrategy.BestEndMapq,
            null, includeSecondary, null, null);
    final SamReader reader = SamReaderFactory.makeDefault().open(output);

    int numFirstRecords = 0;
    int numSecondRecords = 0;
    int firstPrimaryMapq = -1;
    int secondPrimaryMapq = -1;
    for (final SAMRecord rec: reader) {
        Assert.assertTrue(rec.getReadPairedFlag());
        if (rec.getFirstOfPairFlag()) ++numFirstRecords;
        else if (rec.getSecondOfPairFlag()) ++ numSecondRecords;
        else Assert.fail("unpossible!");
        if (!rec.getReadUnmappedFlag() && !rec.getNotPrimaryAlignmentFlag()) {
            if (rec.getFirstOfPairFlag()) {
                Assert.assertEquals(firstPrimaryMapq, -1);
                firstPrimaryMapq = rec.getMappingQuality();
            } else {
                Assert.assertEquals(secondPrimaryMapq, -1);
                secondPrimaryMapq = rec.getMappingQuality();
            }
        } else if (rec.getNotPrimaryAlignmentFlag()) {
            Assert.assertTrue(rec.getMateUnmappedFlag());
        }
    }
    reader.close();
    Assert.assertEquals(firstPrimaryMapq, expectedFirstMapq);
    Assert.assertEquals(secondPrimaryMapq, expectedSecondMapq);
    if (!includeSecondary) {
        Assert.assertEquals(numFirstRecords, 1);
        Assert.assertEquals(numSecondRecords, 1);
    } else {
        // If no alignments for an end, there will be a single unmapped record
        Assert.assertEquals(numFirstRecords, Math.max(1, firstMapQs.length));
        Assert.assertEquals(numSecondRecords, Math.max(1, secondMapQs.length));
    }
}
 
Example 20
Source File: SingleCellRnaSeqMetricsCollector.java    From Drop-seq with MIT License 4 votes vote down vote up
/**
    * Sets up the reads in cell barcode order.
    * Only adds reads that pass the map quality and are in the set of cell barcodes requested.
    *
    * I've tried adapting this to the TagOrderIterator API, but it seems like I need to add the read groups to the header of the temporary BAM that gets
    * iterated on or this doesn't work.
    */
   private CloseableIterator<SAMRecord> getReadsInTagOrder (final File bamFile, final String primaryTag,
                                                            final List<SAMReadGroupRecord> rg,
                                                            final List<String> allCellBarcodes, final int mapQuality) {

	SamReader reader = SamReaderFactory.makeDefault().open(bamFile);
	SAMSequenceDictionary dict= reader.getFileHeader().getSequenceDictionary();
	List<SAMProgramRecord> programs =reader.getFileHeader().getProgramRecords();

	final Set<String> cellBarcodeSet = new HashSet<> (allCellBarcodes);

	final SAMFileHeader writerHeader = new SAMFileHeader();
	// reader.getFileHeader().setReadGroups(rg);
	for (SAMReadGroupRecord z: rg) {
		reader.getFileHeader().addReadGroup(z);
		writerHeader.addReadGroup(z);
	}
       writerHeader.setSortOrder(SAMFileHeader.SortOrder.queryname);
       writerHeader.setSequenceDictionary(dict);
       for (SAMProgramRecord spr : programs)
		writerHeader.addProgramRecord(spr);

       // This not only filters, but sets the RG attribute on reads it allows through.
       final FilteredIterator<SAMRecord> rgAddingFilter = new FilteredIterator<SAMRecord>(reader.iterator()) {
           @Override
           public boolean filterOut(final SAMRecord r) {
               String cellBarcode = r.getStringAttribute(primaryTag);
               if (cellBarcodeSet.contains(cellBarcode) & r.getMappingQuality() >= mapQuality) {
                   r.setAttribute("RG", cellBarcode);
                   return false;
               } else
				return true;
           }
       };

       ProgressLogger p = new ProgressLogger(log, 1000000, "Preparing reads in core barcodes");
       CloseableIterator<SAMRecord> sortedIterator = SamRecordSortingIteratorFactory.create(writerHeader, rgAddingFilter, new StringTagComparator(primaryTag), p);

	log.info("Sorting finished.");
	return (sortedIterator);
}