Java Code Examples for htsjdk.samtools.SAMSequenceDictionary#getSequences()

The following examples show how to use htsjdk.samtools.SAMSequenceDictionary#getSequences() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: MaskReferenceSequence.java    From Drop-seq with MIT License 6 votes vote down vote up
private void processByPartialContig (final ReferenceSequenceFile ref, final FastaSequenceFileWriter writer, final File intervalListFile) {
	SAMSequenceDictionary sd = ref.getSequenceDictionary();
	// validate that the intervals and the reference have the same sequence dictionary.
	IntervalList iList = IntervalList.fromFile(intervalListFile);
	iList.getHeader().getSequenceDictionary().assertSameDictionary(sd);
	// map the intervals to a map to each contig.
	Map<String, List<Interval>> intervalsPerContig = getIntervalsForContig(iList);

	for (SAMSequenceRecord r: sd.getSequences()) {
		String contig = r.getSequenceName();
		log.info("Processing partial contig " + contig);
		// this list can be null.
		List<Interval> intervalsToMask = intervalsPerContig.get(contig);
		ReferenceSequence rs = ref.getSequence(contig);
		writeSequence(rs, intervalsToMask, writer);
	}
}
 
Example 2
Source File: GenomeLocParser.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Create a genome loc parser based on seqDict with the specified level of validation
 * @param seqDict the sequence dictionary to use when creating genome locs
 * @param validationLevel how much validation should we do of the genome locs at runtime? Purely for testing purposes
 */
protected GenomeLocParser(SAMSequenceDictionary seqDict, final ValidationLevel validationLevel) {
    Utils.nonNull(validationLevel, "validation level cannot be null");
    if (seqDict == null) { // we couldn't load the reference dictionary
        //logger.info("Failed to load reference dictionary, falling back to lexicographic order for contigs");
        throw new CommandLineException("Failed to load reference dictionary");
    }

    this.validationLevel = validationLevel;
    this.contigInfo = new MRUCachingSAMSequenceDictionary(seqDict);
    if ( logger.isDebugEnabled() ) {
        logger.debug(String.format("Prepared reference sequence contig dictionary"));
        for (SAMSequenceRecord contig : seqDict.getSequences()) {
            logger.debug(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength()));
        }
    }
}
 
Example 3
Source File: RandomDNA.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Creates a random reference and writes it in FASTA format into a {@link Writer}.
 * @param out the output writer.
 * @param dict the dictionary indicating the number of contigs and their lengths.
 * @param basesPerLine number of base to print in each line of the output FASTA file.
 *
 * @throws IOException if such an exception was thrown while accessing and writing into the temporal file.
 * @throws IllegalArgumentException if {@code dict} is {@code null}, or {@code out } is {@code null}
 *    or {@code basesPerLine} is 0 or negative.
 */
public void nextFasta(final Writer out, final SAMSequenceDictionary dict, final int basesPerLine)
        throws IOException {
    Utils.nonNull(out);
    Utils.nonNull(dict);
    ParamUtils.isPositive(basesPerLine, "number of base per line must be strictly positive: " + basesPerLine);
    final byte[] buffer = new byte[basesPerLine];
    final String lineSeparator = System.lineSeparator();
    for (final SAMSequenceRecord sequence : dict.getSequences()) {
        int pendingBases = sequence.getSequenceLength();
        out.append(">").append(sequence.getSequenceName()).append(lineSeparator);
        while (pendingBases > 0) {
            final int lineLength = pendingBases < basesPerLine ? pendingBases : basesPerLine;
            nextBases(buffer, 0, lineLength);
            out.append(new String(buffer, 0, lineLength)).append(lineSeparator);
            pendingBases -= lineLength;
        }
    }
}
 
Example 4
Source File: ChromosomeLengthFactory.java    From hmftools with GNU General Public License v3.0 6 votes vote down vote up
@NotNull
public static List<ChromosomeLength> create(@NotNull final SAMSequenceDictionary dictionary) {
    final List<ChromosomeLength> results = Lists.newArrayList();

    for (final SAMSequenceRecord samSequenceRecord : dictionary.getSequences()) {
        final String sequenceName = samSequenceRecord.getSequenceName();
        if (HumanChromosome.contains(sequenceName)) {
            results.add(ImmutableChromosomeLength.builder()
                    .chromosome(sequenceName)
                    .length(samSequenceRecord.getSequenceLength())
                    .build());
        }
    }

    return results;
}
 
Example 5
Source File: SageApplication.java    From hmftools with GNU General Public License v3.0 6 votes vote down vote up
private void run() throws InterruptedException, ExecutionException, IOException {

        long timeStamp = System.currentTimeMillis();

        final Map<String, QualityRecalibrationMap> recalibrationMap = qualityRecalibration();
        final SAMSequenceDictionary dictionary = dictionary();
        for (final SAMSequenceRecord samSequenceRecord : dictionary.getSequences()) {
            final String contig = samSequenceRecord.getSequenceName();
            if (config.chromosomes().isEmpty() || config.chromosomes().contains(contig)) {
                if (HumanChromosome.contains(contig) || MitochondrialChromosome.contains(contig)) {
                    try (final ChromosomePipeline pipeline = createChromosomePipeline(contig, recalibrationMap)) {
                        pipeline.process();
                    }
                    System.gc();
                }
            }
        }

//                createChromosomePipeline("10", recalibrationMap).process(130404941, 130405950);

        long timeTaken = System.currentTimeMillis() - timeStamp;
        LOGGER.info("Completed in {} seconds", timeTaken / 1000);
    }
 
Example 6
Source File: BAMRecordWriter.java    From Hadoop-BAM with MIT License 6 votes vote down vote up
private void writeHeader(final SAMFileHeader header) {
	binaryCodec.writeBytes("BAM\001".getBytes(Charset.forName("UTF8")));

	final Writer sw = new StringWriter();
	new SAMTextHeaderCodec().encode(sw, header);

	binaryCodec.writeString(sw.toString(), true, false);

	final SAMSequenceDictionary dict = header.getSequenceDictionary();

	binaryCodec.writeInt(dict.size());
	for (final SAMSequenceRecord rec : dict.getSequences()) {
		binaryCodec.writeString(rec.getSequenceName(), true, true);
		binaryCodec.writeInt   (rec.getSequenceLength());
	}
}
 
Example 7
Source File: VcfHeader.java    From rtg-tools with BSD 2-Clause "Simplified" License 6 votes vote down vote up
/**
 * Add contig lines corresponding to the sequences present in a SAM header.
 * @param header the SAM header.
 */
public void addContigFields(SAMFileHeader header) {
  final SAMSequenceDictionary dic =  header.getSequenceDictionary();
  for (final SAMSequenceRecord seq : dic.getSequences()) {
    final ContigField f = new ContigField(seq.getSequenceName(), seq.getSequenceLength());
    if (seq.getAttribute(SAMSequenceRecord.ASSEMBLY_TAG) != null) {
      f.put("as", seq.getAttribute(SAMSequenceRecord.ASSEMBLY_TAG));
    }
    if (seq.getAttribute(SAMSequenceRecord.MD5_TAG) != null) {
      f.put("md5", seq.getAttribute(SAMSequenceRecord.MD5_TAG));
    }
    if (seq.getAttribute(SAMSequenceRecord.SPECIES_TAG) != null) {
      f.put("species", seq.getAttribute(SAMSequenceRecord.SPECIES_TAG));
    }
    addContigField(f);
  }
}
 
Example 8
Source File: SVReferenceUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Create an RDD from the reference sequences.
 * The reference sequences are transformed into a single, large collection of byte arrays.
 * The collection is then parallelized into an RDD.
 * Each contig that exceeds a size given by {@code refRecordLen} is broken into a series of {@code refRecordLen}
 * chunks with a {@code kSize} - 1 base overlap between successive chunks.
 * (I.e., for {@code kSize} = 63, the last 62 bases in chunk n match the first 62 bases in chunk n+1)
 * so that we don't miss any kmers due to the chunking -- we can just kmerize each record independently.
 */
public static JavaRDD<byte[]> getReferenceBasesRDD(final JavaSparkContext ctx,
                                                   final int kSize,
                                                   final ReferenceMultiSparkSource ref,
                                                   final SAMSequenceDictionary dict,
                                                   final int refRecordLen,
                                                   final int refRecordsPerPartition) {
    Utils.nonNull(dict, "provided dictionary is null");
    Utils.validateArg(kSize!=0, "provided kmer size is zero");
    Utils.validateArg(refRecordLen > 0, "provided ref record length is non positive + " + refRecordLen);
    Utils.validateArg(refRecordsPerPartition > 0, "provided ref record per partition is non positive + " + refRecordsPerPartition);

    final int effectiveRecLen = refRecordLen - kSize + 1;
    final List<byte[]> sequenceChunks = new ArrayList<>();
    for ( final SAMSequenceRecord rec : dict.getSequences() ) {
        final String seqName = rec.getSequenceName();
        final int seqLen = rec.getSequenceLength();
        final SimpleInterval interval = new SimpleInterval(seqName, 1, seqLen);
        try {
            final byte[] bases = ref.getReferenceBases(interval).getBases();
            for ( int start = 0; start < seqLen; start += effectiveRecLen ) {
                sequenceChunks.add(Arrays.copyOfRange(bases, start, Math.min(start+refRecordLen, seqLen)));
            }
        }
        catch ( final IOException ioe ) {
            throw new GATKException("Can't get reference sequence bases for " + interval, ioe);
        }
    }

    return ctx.parallelize(sequenceChunks, sequenceChunks.size()/refRecordsPerPartition+1);
}
 
Example 9
Source File: SequenceDictionaryUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * A very simple (and naive) algorithm to determine (1) if the dict is a human reference (hg18, hg19, b36, or b37) and if it's
 * lexicographically sorted.  Works by matching lengths of the static chr1, chr10, and chr2, and then if these
 * are all matched, requiring that the order be chr1, chr2, chr10.
 *
 * @param dict
 * @return
 */
private static boolean nonCanonicalHumanContigOrder(SAMSequenceDictionary dict) {
    SAMSequenceRecord chr1 = null, chr2 = null, chr10 = null;
    for ( SAMSequenceRecord elt : dict.getSequences() ) {
        if ( isHumanSeqRecord(elt, CHR1_HG18, CHR1_HG19, CHR1_B36, CHR1_B37) ) chr1 = elt;
        if ( isHumanSeqRecord(elt, CHR2_HG18, CHR2_HG19, CHR2_B36, CHR2_B37) ) chr2 = elt;
        if ( isHumanSeqRecord(elt, CHR10_HG18, CHR10_HG19, CHR10_B36, CHR10_B37) ) chr10 = elt;
    }
    if ( chr1 != null  && chr2 != null && chr10 != null) {
        return ! ( chr1.getSequenceIndex() < chr2.getSequenceIndex() && chr2.getSequenceIndex() < chr10.getSequenceIndex() );
    }

    return false;
}
 
Example 10
Source File: SequenceDictionaryUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Utility function that tests whether dict1's set of contigs is a superset of dict2's
 *
 * @param dict1 first sequence dictionary
 * @param dict2 second sequence dictionary
 * @return true if dict1's set of contigs supersets dict2's
 */
private static boolean supersets( SAMSequenceDictionary dict1, SAMSequenceDictionary dict2 ) {
    // Cannot rely on SAMSequenceRecord.equals() as it's too strict (takes extended attributes into account).
    for ( final SAMSequenceRecord dict2Record : dict2.getSequences() ) {
        final SAMSequenceRecord dict1Record = dict1.getSequence(dict2Record.getSequenceName());
        if ( dict1Record == null || ! sequenceRecordsAreEquivalent(dict2Record, dict1Record) ) {
            return false;
        }
    }

    return true;
}
 
Example 11
Source File: BamOverlapChecker.java    From systemsgenetics with GNU General Public License v3.0 5 votes vote down vote up
public BamOverlapChecker(SamReader bam_file){
    
    SAMFileHeader  header = bam_file.getFileHeader();
    SAMSequenceDictionary dict = header.getSequenceDictionary();
    List<SAMSequenceRecord> sequences = dict.getSequences();
   
    
    booleanMap = new HashMap<String, boolean[]>();
    
    for(SAMSequenceRecord sequence : sequences){
        int sequenceEnd = sequence.getSequenceLength();
        int arrayLength = (int) Math.ceil( (float) sequenceEnd / (float)stepSize );
        boolean[] tempArray;
        tempArray = new boolean[arrayLength];
        
        for(int i=0;i<arrayLength;i++){
            SAMRecordIterator bamQuery = bam_file.queryOverlapping(sequence.getSequenceName(), i*stepSize, (i+1)*stepSize);
            if(bamQuery.hasNext()){
                tempArray[i] = true;
            }else{
                tempArray[i] = false;
            }
            bamQuery.close();
        }
        booleanMap.put(sequence.getSequenceName(),tempArray );
        //System.out.println("Finished checking the bam for chromosome " + sequence.getSequenceName());
    }
    
}
 
Example 12
Source File: HalvadeConf.java    From halvade with GNU General Public License v3.0 5 votes vote down vote up
public static void setSequenceDictionary(Configuration conf, SAMSequenceDictionary dict) throws IOException, URISyntaxException {
    int counter = 0;
    for(SAMSequenceRecord seq : dict.getSequences()) {
        conf.set(dictionarySequenceName + counter, seq.getSequenceName());
        conf.setInt(dictionarySequenceLength + counter, seq.getSequenceLength());
        counter++;
    }
    conf.setInt(dictionaryCount, counter);
}
 
Example 13
Source File: ReorderSam.java    From picard with MIT License 5 votes vote down vote up
/**
 * Helper function to print out a sequence dictionary
 */
private void printDictionary(String name, SAMSequenceDictionary dict) {
    log.info(name);
    for (final SAMSequenceRecord contig : dict.getSequences()) {
        log.info(String.format("   SN=%s LN=%d", contig.getSequenceName(), contig.getSequenceLength()));
    }
}
 
Example 14
Source File: SamUtils.java    From rtg-tools with BSD 2-Clause "Simplified" License 5 votes vote down vote up
/**
 * Gets a lookup from sequence names to sequence ids
 * @param dict the sequence dictionary
 * @return the lookup.
 */
public static Map<String, Integer> getSequenceIdLookup(SAMSequenceDictionary dict) {
  final List<SAMSequenceRecord> sequences = dict.getSequences();
  final Map<String, Integer> lookup = new HashMap<>(sequences.size());
  for (SAMSequenceRecord rec : sequences) {
    lookup.put(rec.getSequenceName(), rec.getSequenceIndex());
  }
  return lookup;
}
 
Example 15
Source File: PonApplication.java    From hmftools with GNU General Public License v3.0 5 votes vote down vote up
private void run() throws IOException, ExecutionException, InterruptedException {

        if (files.isEmpty()) {
            return;
        }

        final VCFFileReader dictionaryReader = new VCFFileReader(files.get(0), true);
        SAMSequenceDictionary dictionary = dictionaryReader.getFileHeader().getSequenceDictionary();
        dictionaryReader.close();

        for (SAMSequenceRecord samSequenceRecord : dictionary.getSequences()) {
            LOGGER.info("Processing sequence {}", samSequenceRecord.getSequenceName());
            final PonBuilder ponBuilder = new PonBuilder();
            final RunnableTaskCompletion runnableTaskCompletion = new RunnableTaskCompletion();

            List<Future<?>> contigFutures = Lists.newArrayList();

            for (Path file : Files.newDirectoryStream(new File(input).toPath(), GLOB)) {
                Runnable runnable = () -> addVariantsFromFileToBuilder(ponBuilder, samSequenceRecord, file);
                contigFutures.add(executorService.submit(runnableTaskCompletion.task(runnable)));
            }

            for (Future<?> contigFuture : contigFutures) {
                contigFuture.get();
            }

            vcf.write(ponBuilder.build());
        }
    }
 
Example 16
Source File: IntervalTagComparatorTest.java    From Drop-seq with MIT License 5 votes vote down vote up
private List<SAMRecord> createManyIntervalTaggedSAMRecords (final int desiredNumRecords) {
	List<SAMRecord> data = new ArrayList<>();

	SamReader inputSam = SamReaderFactory.makeDefault().open(this.dictFile);
	SAMRecord samRecordTemplate = new SAMRecord (inputSam.getFileHeader());

	SAMSequenceDictionary dict= inputSam.getFileHeader().getSequenceDictionary();
	List<SAMSequenceRecord> recs = dict.getSequences();
	int numRecs = recs.size();

	Random randomGenerator = new Random();
	for (int i=0; i<desiredNumRecords; i++) {
		SAMSequenceRecord r = recs.get(randomGenerator.nextInt(numRecs+1));
		String chr = r.getSequenceName();
		int seqLen = r.getSequenceLength();
		int s1 = randomGenerator.nextInt(seqLen);
		int s2 = randomGenerator.nextInt(seqLen);
		int s = Math.min(s1, s2);
		int e = Math.max(s1, s2);
		Interval interval = new Interval (chr, s1,s2);
		try {
			SAMRecord r1 = (SAMRecord) samRecordTemplate.clone();
			// I realize that using encoding the full interval can be a bit heavy handed.
			r1.setAttribute(this.intervalTag, interval.toString());
			data.add(r1);
		} catch (CloneNotSupportedException e1) {
			// this should never happen, sigh.
		}
	}
	return data;
}
 
Example 17
Source File: ReadMetadata.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
public static Map<String, Integer> buildContigNameToIDMap( final SAMSequenceDictionary dictionary ) {
    final List<SAMSequenceRecord> contigs = dictionary.getSequences();
    final Map<String, Integer> contigNameToID = new HashMap<>(SVUtils.hashMapCapacity(contigs.size()));
    final int nContigs = contigs.size();
    for ( int contigID = 0; contigID < nContigs; ++contigID ) {
        contigNameToID.put(contigs.get(contigID).getSequenceName(), contigID);
    }
    return contigNameToID;
}
 
Example 18
Source File: GenomeLocParserUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Test
public void testCreateGenomeLocOnContig() throws IOException {
    try(final CachingIndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(
            IOUtils.getPath(exampleReference))) {
        final SAMSequenceDictionary dict = seq.getSequenceDictionary();
        final GenomeLocParser genomeLocParser = new GenomeLocParser(dict);

        for (final SAMSequenceRecord rec : dict.getSequences()) {
            final GenomeLoc loc = genomeLocParser.createOverEntireContig(rec.getSequenceName());
            Assert.assertEquals(loc.getContig(), rec.getSequenceName());
            Assert.assertEquals(loc.getStart(), 1);
            Assert.assertEquals(loc.getStop(), rec.getSequenceLength());
        }
    }
}
 
Example 19
Source File: SequenceDictionaryUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
public static Set<String> getContigNames(SAMSequenceDictionary dict) {
    Set<String> contigNames = new LinkedHashSet<String>(Utils.optimumHashSize(dict.size()));
    for (SAMSequenceRecord dictionaryEntry : dict.getSequences())
        contigNames.add(dictionaryEntry.getSequenceName());
    return contigNames;
}
 
Example 20
Source File: GatherGeneGCLength.java    From Drop-seq with MIT License 4 votes vote down vote up
@Override
   protected int doWork() {

       IOUtil.assertFileIsReadable(ANNOTATIONS_FILE);
       IOUtil.assertFileIsWritable(this.OUTPUT);
       PrintStream out = new ErrorCheckingPrintStream(IOUtil.openFileForWriting(OUTPUT));
       writeHeader(out);

       PrintStream outTranscript = null;
       if (this.OUTPUT_TRANSCRIPT_LEVEL!=null) {
		outTranscript = new ErrorCheckingPrintStream(IOUtil.openFileForWriting(OUTPUT_TRANSCRIPT_LEVEL));
		writeHeaderTranscript(outTranscript);
       }

       FastaSequenceFileWriter  outSequence = null;

       if (this.OUTPUT_TRANSCRIPT_SEQUENCES!=null) {
       	IOUtil.assertFileIsWritable(this.OUTPUT_TRANSCRIPT_SEQUENCES);
		outSequence = new FastaSequenceFileWriter (this.OUTPUT_TRANSCRIPT_SEQUENCES);
       }
       ReferenceSequenceFileWalker refFileWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);

       SAMSequenceDictionary dict= refFileWalker.getSequenceDictionary();
       if (dict==null) {
       	CloserUtil.close(refFileWalker);
       	throw new IllegalArgumentException("Reference file" + this.REFERENCE_SEQUENCE.getAbsolutePath()+" is missing a dictionary file [.dict].  Please make one!");
       }

       OverlapDetector<Gene> geneOverlapDetector= GeneAnnotationReader.loadAnnotationsFile(ANNOTATIONS_FILE, dict);

       List<SAMSequenceRecord> records = dict.getSequences();

	for (SAMSequenceRecord record: records) {
		String seqName = record.getSequenceName();
		int seqIndex=dict.getSequenceIndex(seqName);
		ReferenceSequence fastaRef=refFileWalker.get(seqIndex);

		// get the genes for this contig.
		Interval i = new Interval(seqName, 1, record.getSequenceLength());
		Collection< Gene> genes = geneOverlapDetector.getOverlaps(i);
		for (Gene g: genes) {
			List<GCResult> gcList = calculateGCContentGene(g, fastaRef, dict);
			if (this.OUTPUT_TRANSCRIPT_LEVEL!=null)
				writeResultTranscript(gcList, outTranscript);
			GCIsoformSummary summary = new GCIsoformSummary(g, gcList);
			if (this.OUTPUT_TRANSCRIPT_SEQUENCES!=null)
				writeTranscriptSequence(g, fastaRef, dict, outSequence);

			GCResult gc = calculateGCContentUnionExons(g, fastaRef, dict);

			writeResult(gc, summary, out);
		}
	}
	CloserUtil.close(refFileWalker);
	CloserUtil.close(out);
	if (this.OUTPUT_TRANSCRIPT_LEVEL!=null) CloserUtil.close(outTranscript);
	if (this.OUTPUT_TRANSCRIPT_SEQUENCES!=null) CloserUtil.close(outSequence);
       return 0;
}