Java Code Examples for htsjdk.samtools.SAMSequenceRecord#getSequenceName()

The following examples show how to use htsjdk.samtools.SAMSequenceRecord#getSequenceName() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: VcfHeader.java    From rtg-tools with BSD 2-Clause "Simplified" License 6 votes vote down vote up
/**
 * Add contig lines corresponding to the sequences present in a SAM header.
 * @param header the SAM header.
 */
public void addContigFields(SAMFileHeader header) {
  final SAMSequenceDictionary dic =  header.getSequenceDictionary();
  for (final SAMSequenceRecord seq : dic.getSequences()) {
    final ContigField f = new ContigField(seq.getSequenceName(), seq.getSequenceLength());
    if (seq.getAttribute(SAMSequenceRecord.ASSEMBLY_TAG) != null) {
      f.put("as", seq.getAttribute(SAMSequenceRecord.ASSEMBLY_TAG));
    }
    if (seq.getAttribute(SAMSequenceRecord.MD5_TAG) != null) {
      f.put("md5", seq.getAttribute(SAMSequenceRecord.MD5_TAG));
    }
    if (seq.getAttribute(SAMSequenceRecord.SPECIES_TAG) != null) {
      f.put("species", seq.getAttribute(SAMSequenceRecord.SPECIES_TAG));
    }
    addContigField(f);
  }
}
 
Example 2
Source File: FindBadGenomicKmersSparkUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
@Test(groups = "sv")
public void miniRefTest() throws IOException {
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final ReferenceMultiSparkSource ref = new ReferenceMultiSparkSource(
            REFERENCE_FILE_NAME, ReferenceWindowFunctions.IDENTITY_FUNCTION);
    final SAMSequenceDictionary dict = ref.getReferenceSequenceDictionary(null);
    if ( dict == null ) throw new GATKException("No reference dictionary available.");

    final Map<SVKmer, Long> kmerMap = new LinkedHashMap<>();
    for ( final SAMSequenceRecord rec : dict.getSequences() ) {
        final SimpleInterval interval = new SimpleInterval(rec.getSequenceName(), 1, rec.getSequenceLength());
        final byte[] bases = ref.getReferenceBases(interval).getBases();
        SVKmerizer.canonicalStream(bases, KMER_SIZE, new SVKmerLong())
                .forEach(kmer -> kmerMap.put(kmer, kmerMap.getOrDefault(kmer, 0L) + 1));
    }
    kmerMap.entrySet().removeIf( x -> x.getValue() <= FindBadGenomicKmersSpark.MAX_KMER_FREQ);

    final List<SVKmer> badKmers =
            FindBadGenomicKmersSpark.findBadGenomicKmers(ctx, KMER_SIZE, Integer.MAX_VALUE, ref, null);
    final Set<SVKmer> badKmerSet = new HashSet<>(badKmers);
    Assert.assertEquals(badKmers.size(), badKmerSet.size());
    Assert.assertEquals(badKmerSet, kmerMap.keySet());
}
 
Example 3
Source File: CreateSequenceDictionary.java    From picard with MIT License 6 votes vote down vote up
/**
 * Read all the sequences from the given reference file, and convert into SAMSequenceRecords
 *
 * @param referenceFile fasta or fasta.gz
 * @return SAMSequenceRecords containing info from the fasta, plus from cmd-line arguments.
 */
public SAMSequenceDictionary makeSequenceDictionary(final File referenceFile) {
    final Iterable<SAMSequenceRecord> samSequenceRecordsIterable = getSamSequenceRecordsIterable();

    final List<SAMSequenceRecord> ret = new ArrayList<>();
    final Set<String> sequenceNames = new HashSet<>();
    for (SAMSequenceRecord rec : samSequenceRecordsIterable) {

        if (sequenceNames.contains(rec.getSequenceName())) {
            throw new PicardException("Sequence name appears more than once in reference: " + rec.getSequenceName());
        }
        sequenceNames.add(rec.getSequenceName());
        ret.add(rec);
    }
    return new SAMSequenceDictionary(ret);
}
 
Example 4
Source File: SamRangeUtils.java    From rtg-tools with BSD 2-Clause "Simplified" License 6 votes vote down vote up
static <T> void validateRanges(SAMFileHeader header, ReferenceRanges<T> rangeMap) {
  for (final String seq : rangeMap.sequenceNames()) {
    final SAMSequenceRecord r  = header.getSequenceDictionary().getSequence(seq);
    if (r == null) {
      throw new NoTalkbackSlimException("Sequence \"" + seq + "\" referenced in regions not found in the SAM sequence dictionary.");
    }

    if (r.getSequenceLength() > 0) {
      final RangeList<T> rs = rangeMap.get(seq);
      if (rs != null) {
        final List<? extends Interval> ranges = rs.getRangeList();
        final Interval last = ranges.get(ranges.size() - 1);
        if (last.getEnd() >  r.getSequenceLength()) {
          throw new NoTalkbackSlimException("Specified sequence range (" + r.getSequenceName() + ":" + last + ") is outside the length of the sequence (" + r.getSequenceLength() + ")");
        }
      }
    }
  }
}
 
Example 5
Source File: AnnotateTargetsIntegrationTest.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
private List<SimpleInterval> createRandomIntervals(final SAMSequenceDictionary referenceDictionary, final int numberOfIntervals, final int minIntervalSize, final int maxIntervalSize, final int meanIntervalSize, final double intervalSizeStdev) {
    final List<SimpleInterval> result = new ArrayList<>(numberOfIntervals);
    final int numberOfSequences = referenceDictionary.getSequences().size();
    for (int i = 0; i < numberOfIntervals; i++) {
        final SAMSequenceRecord contig = referenceDictionary.getSequence(RANDOM.nextInt(numberOfSequences));
        final String contigName = contig.getSequenceName();
        final int intervalSize = Math.min(maxIntervalSize, (int) Math.max(minIntervalSize, Math.round(RANDOM.nextDouble() * intervalSizeStdev + meanIntervalSize)));
        final int intervalStart = 1 + RANDOM.nextInt(contig.getSequenceLength() - intervalSize);
        final int intervalEnd = intervalStart + intervalSize - 1;
        final SimpleInterval interval = new SimpleInterval(contigName, intervalStart, intervalEnd);
        result.add(interval);
    }

    final Comparator<SimpleInterval> comparator =
            Comparator.comparing(SimpleInterval::getContig,
                    (a, b) -> Integer.compare(
                            referenceDictionary.getSequenceIndex(a),
                            referenceDictionary.getSequenceIndex(b)))
            .thenComparingInt(SimpleInterval::getStart)
            .thenComparingInt(SimpleInterval::getEnd);
    Collections.sort(result, comparator);
    return result;
}
 
Example 6
Source File: ReAligner.java    From abra2 with MIT License 6 votes vote down vote up
private List<Feature> getRegionsNoBed(int readLength, SAMFileHeader header) throws IOException {
	
	List<Feature> regions = new ArrayList<Feature>();
	
	List<SAMSequenceRecord> refSeq = header.getSequenceDictionary().getSequences();
	
	for (SAMSequenceRecord seq : refSeq) {
		if (!chromosomeSkipRegex.matches(seq.getSequenceName())) {
			Feature region = new Feature(seq.getSequenceName(), 1, seq.getSequenceLength());
			regions.add(region);
		}
	}
	
	regions = RegionLoader.collapseRegions(regions, readLength);
	regions = splitRegions(regions);
	
	return regions;
}
 
Example 7
Source File: PicardIndexedFastaSequenceFile.java    From chipster with MIT License 6 votes vote down vote up
/**
 * Do some basic checking to make sure the dictionary and the index match.
 * @param fastaFile Used for error reporting only.
 * @param sequenceDictionary sequence dictionary to check against the index.
 * @param index index file to check against the dictionary.
 */
protected static void sanityCheckDictionaryAgainstIndex(final String fastaFile,
                                                        final SAMSequenceDictionary sequenceDictionary,
                                                        final FastaSequenceIndex index) {
    // Make sure dictionary and index are the same size.
    if( sequenceDictionary.getSequences().size() != index.size() )
        throw new SAMException("Sequence dictionary and index contain different numbers of contigs");

    Iterator<SAMSequenceRecord> sequenceIterator = sequenceDictionary.getSequences().iterator();
    Iterator<FastaSequenceIndexEntry> indexIterator = index.iterator();

    while(sequenceIterator.hasNext() && indexIterator.hasNext()) {
        SAMSequenceRecord sequenceEntry = sequenceIterator.next();
        FastaSequenceIndexEntry indexEntry = indexIterator.next();

        if(!sequenceEntry.getSequenceName().equals(indexEntry.getContig())) {
            throw new SAMException(String.format("Mismatch between sequence dictionary fasta index for %s, sequence '%s' != '%s'.",
                    fastaFile, sequenceEntry.getSequenceName(),indexEntry.getContig()));
        }

        // Make sure sequence length matches index length.
        if( sequenceEntry.getSequenceLength() != indexEntry.getSize())
            throw new SAMException("Index length does not match dictionary length for contig: " + sequenceEntry.getSequenceName() );
    }
}
 
Example 8
Source File: MaskReferenceSequence.java    From Drop-seq with MIT License 6 votes vote down vote up
private void processByPartialContig (final ReferenceSequenceFile ref, final FastaSequenceFileWriter writer, final File intervalListFile) {
	SAMSequenceDictionary sd = ref.getSequenceDictionary();
	// validate that the intervals and the reference have the same sequence dictionary.
	IntervalList iList = IntervalList.fromFile(intervalListFile);
	iList.getHeader().getSequenceDictionary().assertSameDictionary(sd);
	// map the intervals to a map to each contig.
	Map<String, List<Interval>> intervalsPerContig = getIntervalsForContig(iList);

	for (SAMSequenceRecord r: sd.getSequences()) {
		String contig = r.getSequenceName();
		log.info("Processing partial contig " + contig);
		// this list can be null.
		List<Interval> intervalsToMask = intervalsPerContig.get(contig);
		ReferenceSequence rs = ref.getSequence(contig);
		writeSequence(rs, intervalsToMask, writer);
	}
}
 
Example 9
Source File: BamDataSource.java    From chipster with MIT License 5 votes vote down vote up
/**
  * Generally we would like to have both data and index files,
  * because otherwise we cannot access random locations.
  * 
  * @param data
  * @param index
  * @throws URISyntaxException
  * @throws IOException 
  */
 public BamDataSource(DataUrl data, DataUrl index) throws URISyntaxException, IOException {
     super(data);

 	// BAMFileReader emits useless warning to System.err that can't be turned off,
 	// so we direct it to other stream and discard. 
 	PrintStream originalErr = System.err;
 	System.setErr(new PrintStream(new ByteArrayOutputStream()));
 	
 	SAMFileReader.setDefaultValidationStringency(ValidationStringency.SILENT);
 	this.reader = SamBamUtils.getSAMReader(data.getUrl(), index.getUrl());

 	LinkedList<String> chrList = new LinkedList<>();
 	
 	// Iterate chromosomes to check naming convention
 	for (SAMSequenceRecord sequenceRecord : this.reader.getFileHeader().getSequenceDictionary().getSequences()) {
 		
 		String name = sequenceRecord.getSequenceName();
 		
chrList.add(name);
 	}
 	
 	// Create unnormaliser for this naming convention
 	// Look only at the first sequence (assume all have the same convention)
 	this.chromosomeNameUnnormaliser = new ChromosomeNameUnnormaliser(chrList);      
     
     // Restore System.err
     System.setErr(originalErr);
 }
 
Example 10
Source File: IntervalTagComparatorTest.java    From Drop-seq with MIT License 5 votes vote down vote up
private List<SAMRecord> createManyIntervalTaggedSAMRecords (final int desiredNumRecords) {
	List<SAMRecord> data = new ArrayList<>();

	SamReader inputSam = SamReaderFactory.makeDefault().open(this.dictFile);
	SAMRecord samRecordTemplate = new SAMRecord (inputSam.getFileHeader());

	SAMSequenceDictionary dict= inputSam.getFileHeader().getSequenceDictionary();
	List<SAMSequenceRecord> recs = dict.getSequences();
	int numRecs = recs.size();

	Random randomGenerator = new Random();
	for (int i=0; i<desiredNumRecords; i++) {
		SAMSequenceRecord r = recs.get(randomGenerator.nextInt(numRecs+1));
		String chr = r.getSequenceName();
		int seqLen = r.getSequenceLength();
		int s1 = randomGenerator.nextInt(seqLen);
		int s2 = randomGenerator.nextInt(seqLen);
		int s = Math.min(s1, s2);
		int e = Math.max(s1, s2);
		Interval interval = new Interval (chr, s1,s2);
		try {
			SAMRecord r1 = (SAMRecord) samRecordTemplate.clone();
			// I realize that using encoding the full interval can be a bit heavy handed.
			r1.setAttribute(this.intervalTag, interval.toString());
			data.add(r1);
		} catch (CloneNotSupportedException e1) {
			// this should never happen, sigh.
		}
	}
	return data;
}
 
Example 11
Source File: MaskReferenceSequence.java    From Drop-seq with MIT License 5 votes vote down vote up
private Set<String> selectContigsToIgnore (final SAMSequenceDictionary sd, final List<String> patterns) {
	Set<String> result = new HashSet<>();
	for (SAMSequenceRecord r:  sd.getSequences()) {
		String contigName = r.getSequenceName();
		if (containsSubString(contigName, patterns))
			result.add(contigName);
	}
	return result;
}
 
Example 12
Source File: MaskReferenceSequence.java    From Drop-seq with MIT License 5 votes vote down vote up
private void processByWholeContig (final ReferenceSequenceFile ref, final FastaSequenceFileWriter writer, final List<String> contigPatternToIgnore) {
	SAMSequenceDictionary sd = ref.getSequenceDictionary();
	Set<String> contigsToIgnore = selectContigsToIgnore(sd, contigPatternToIgnore);
	// write out each contig.

	for (SAMSequenceRecord r: sd.getSequences()) {
		String contig = r.getSequenceName();
		log.info("Processing complete contig " + contig);
		ReferenceSequence rs = ref.getSequence(contig);
		boolean setSequenceToN = contigsToIgnore.contains(contig);
		writeSequence(rs, setSequenceToN, writer);
	}

}
 
Example 13
Source File: HeaderInfo.java    From dataflow-java with Apache License 2.0 5 votes vote down vote up
public static HeaderInfo getHeaderFromBAMFile(Storage.Objects storage, String BAMPath, Iterable<Contig> explicitlyRequestedContigs) throws IOException {
  HeaderInfo result = null;

  // Open and read start of BAM
  LOG.info("Reading header from " + BAMPath);
  final SamReader samReader = BAMIO
      .openBAM(storage, BAMPath, ValidationStringency.DEFAULT_STRINGENCY);
  final SAMFileHeader header = samReader.getFileHeader();
  Contig firstContig = getFirstExplicitContigOrNull(header, explicitlyRequestedContigs);
  if (firstContig == null) {
    final SAMSequenceRecord seqRecord = header.getSequence(0);
    firstContig = new Contig(seqRecord.getSequenceName(), -1, -1);
  }

  LOG.info("Reading first chunk of reads from " + BAMPath);
  final SAMRecordIterator recordIterator = samReader.query(
      firstContig.referenceName, (int)firstContig.start + 1, (int)firstContig.end + 1, false);

  Contig firstShard = null;
  while (recordIterator.hasNext() && result == null) {
    SAMRecord record = recordIterator.next();
    final int alignmentStart = record.getAlignmentStart();
    if (firstShard == null && alignmentStart > firstContig.start &&
        (alignmentStart < firstContig.end || firstContig.end == -1)) {
      firstShard = new Contig(firstContig.referenceName, alignmentStart, alignmentStart);
      LOG.info("Determined first shard to be " + firstShard);
      result = new HeaderInfo(header, firstShard);
    }
  }
  recordIterator.close();
  samReader.close();

  if (result == null) {
    throw new IOException("Did not find reads for the first contig " + firstContig.toString());
  }
  LOG.info("Finished header reading from " + BAMPath);
  return result;
}
 
Example 14
Source File: GenomeLocParser.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * validate a position or interval on the genome as valid
 *
 * Requires that contig exist in the master sequence dictionary, and that contig index be valid as well.  Requires
 * that start <= stop.
 *
 * if mustBeOnReference is true,
 * performs boundary validation for genome loc INTERVALS:
 * start and stop are on contig and start <= stop
 *
 * @param contig the contig name
 * @param start  the start position
 * @param stop   the stop position
 *
 * @return the interned contig name, an optimization that ensures that contig == the string in the sequence dictionary
 */
protected String validateGenomeLoc(final String contig, final int contigIndex, final int start, final int stop, final boolean mustBeOnReference) {
    if ( validationLevel == ValidationLevel.NONE )
        return contig;
    else {
        if (stop < start)
            vglHelper(String.format("The stop position %d is less than start %d in contig %s", stop, start, contig));

        final SAMSequenceRecord contigInfo = this.contigInfo.getSequence(contig);
        if ( contigInfo.getSequenceIndex() != contigIndex )
            vglHelper(String.format("The contig index %d is bad, doesn't equal the contig index %d of the contig from a string %s",
                    contigIndex, contigInfo.getSequenceIndex(), contig));

        if ( mustBeOnReference ) {
            if (start < 1)
                vglHelper(String.format("The start position %d is less than 1", start));

            if (stop < 1)
                vglHelper(String.format("The stop position %d is less than 1", stop));

            final int contigSize = contigInfo.getSequenceLength();
            if (contigSize == SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH) {
                logger.warn(String.format("The available sequence dictionary does not contain a sequence length for contig (%s). " +
                        "Skipping validation of the genome loc end coordinate (%d).",
                        contig, stop));
            }
            else if (start > contigSize || stop > contigSize) {
                vglHelper(String.format("The genome loc coordinates %d-%d exceed the contig size (%d)", start, stop, contigSize));
            }
        }

        return contigInfo.getSequenceName();
    }
}
 
Example 15
Source File: MRUCachingSAMSequenceDictionary.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * The key algorithm.  Given a new record, update the last used record, contig
 * name, and index.
 *
 * @param contig the contig we want to look up.  If null, index is used instead
 * @param index the contig index we want to look up.  Only used if contig is null
 * @throws GATKException if index isn't present in the dictionary
 * @return the SAMSequenceRecord for contig / index
 */
private SAMSequenceRecord updateCache(final String contig, int index ) {
    SAMSequenceRecord rec = contig == null ? dict.getSequence(index) : dict.getSequence(contig);
    if ( rec == null ) {
        throw new GATKException("BUG: requested unknown contig=" + contig + " index=" + index);
    } else {
        lastSSR = rec;
        lastContig = rec.getSequenceName();
        lastIndex = rec.getSequenceIndex();
        return rec;
    }
}
 
Example 16
Source File: SVReferenceUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Create an RDD from the reference sequences.
 * The reference sequences are transformed into a single, large collection of byte arrays.
 * The collection is then parallelized into an RDD.
 * Each contig that exceeds a size given by {@code refRecordLen} is broken into a series of {@code refRecordLen}
 * chunks with a {@code kSize} - 1 base overlap between successive chunks.
 * (I.e., for {@code kSize} = 63, the last 62 bases in chunk n match the first 62 bases in chunk n+1)
 * so that we don't miss any kmers due to the chunking -- we can just kmerize each record independently.
 */
public static JavaRDD<byte[]> getReferenceBasesRDD(final JavaSparkContext ctx,
                                                   final int kSize,
                                                   final ReferenceMultiSparkSource ref,
                                                   final SAMSequenceDictionary dict,
                                                   final int refRecordLen,
                                                   final int refRecordsPerPartition) {
    Utils.nonNull(dict, "provided dictionary is null");
    Utils.validateArg(kSize!=0, "provided kmer size is zero");
    Utils.validateArg(refRecordLen > 0, "provided ref record length is non positive + " + refRecordLen);
    Utils.validateArg(refRecordsPerPartition > 0, "provided ref record per partition is non positive + " + refRecordsPerPartition);

    final int effectiveRecLen = refRecordLen - kSize + 1;
    final List<byte[]> sequenceChunks = new ArrayList<>();
    for ( final SAMSequenceRecord rec : dict.getSequences() ) {
        final String seqName = rec.getSequenceName();
        final int seqLen = rec.getSequenceLength();
        final SimpleInterval interval = new SimpleInterval(seqName, 1, seqLen);
        try {
            final byte[] bases = ref.getReferenceBases(interval).getBases();
            for ( int start = 0; start < seqLen; start += effectiveRecLen ) {
                sequenceChunks.add(Arrays.copyOfRange(bases, start, Math.min(start+refRecordLen, seqLen)));
            }
        }
        catch ( final IOException ioe ) {
            throw new GATKException("Can't get reference sequence bases for " + interval, ioe);
        }
    }

    return ctx.parallelize(sequenceChunks, sequenceChunks.size()/refRecordsPerPartition+1);
}
 
Example 17
Source File: PSBuildReferenceTaxonomyUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Build set of accessions contained in the reference.
 * Returns: a map from accession to the name and length of the record. If the sequence name contains the
 * taxonomic ID, it instead gets added to taxIdToProperties. Later we merge both results into taxIdToProperties.
 * Method: First, look for either "taxid|<taxid>|" or "ref|<accession>|" in the sequence name. If neither of
 * those are found, use the first word of the name as the accession.
 */
protected static Map<String, Tuple2<String, Long>> parseReferenceRecords(final List<SAMSequenceRecord> dictionaryList,
                                                                         final Map<Integer, PSPathogenReferenceTaxonProperties> taxIdToProperties) {

    final Map<String, Tuple2<String, Long>> accessionToNameAndLength = new HashMap<>();
    for (final SAMSequenceRecord record : dictionaryList) {
        final String recordName = record.getSequenceName();
        final long recordLength = record.getSequenceLength();
        final String[] tokens = recordName.split(VERTICAL_BAR_DELIMITER_REGEX);
        String recordAccession = null;
        int recordTaxId = PSTree.NULL_NODE;
        for (int i = 0; i < tokens.length - 1 && recordTaxId == PSTree.NULL_NODE; i++) {
            if (tokens[i].equals("ref")) {
                recordAccession = tokens[i + 1];
            } else if (tokens[i].equals("taxid")) {
                recordTaxId = parseTaxonId(tokens[i + 1]);
            }
        }
        if (recordTaxId == PSTree.NULL_NODE) {
            if (recordAccession == null) {
                final String[] tokens2 = tokens[0].split(" "); //Default accession to first word in the name
                recordAccession = tokens2[0];
            }
            accessionToNameAndLength.put(recordAccession, new Tuple2<>(recordName, recordLength));
        } else {
            addReferenceAccessionToTaxon(recordTaxId, recordName, recordLength, taxIdToProperties);
        }
    }
    return accessionToNameAndLength;
}
 
Example 18
Source File: GatherGeneGCLength.java    From Drop-seq with MIT License 4 votes vote down vote up
@Override
   protected int doWork() {

       IOUtil.assertFileIsReadable(ANNOTATIONS_FILE);
       IOUtil.assertFileIsWritable(this.OUTPUT);
       PrintStream out = new ErrorCheckingPrintStream(IOUtil.openFileForWriting(OUTPUT));
       writeHeader(out);

       PrintStream outTranscript = null;
       if (this.OUTPUT_TRANSCRIPT_LEVEL!=null) {
		outTranscript = new ErrorCheckingPrintStream(IOUtil.openFileForWriting(OUTPUT_TRANSCRIPT_LEVEL));
		writeHeaderTranscript(outTranscript);
       }

       FastaSequenceFileWriter  outSequence = null;

       if (this.OUTPUT_TRANSCRIPT_SEQUENCES!=null) {
       	IOUtil.assertFileIsWritable(this.OUTPUT_TRANSCRIPT_SEQUENCES);
		outSequence = new FastaSequenceFileWriter (this.OUTPUT_TRANSCRIPT_SEQUENCES);
       }
       ReferenceSequenceFileWalker refFileWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);

       SAMSequenceDictionary dict= refFileWalker.getSequenceDictionary();
       if (dict==null) {
       	CloserUtil.close(refFileWalker);
       	throw new IllegalArgumentException("Reference file" + this.REFERENCE_SEQUENCE.getAbsolutePath()+" is missing a dictionary file [.dict].  Please make one!");
       }

       OverlapDetector<Gene> geneOverlapDetector= GeneAnnotationReader.loadAnnotationsFile(ANNOTATIONS_FILE, dict);

       List<SAMSequenceRecord> records = dict.getSequences();

	for (SAMSequenceRecord record: records) {
		String seqName = record.getSequenceName();
		int seqIndex=dict.getSequenceIndex(seqName);
		ReferenceSequence fastaRef=refFileWalker.get(seqIndex);

		// get the genes for this contig.
		Interval i = new Interval(seqName, 1, record.getSequenceLength());
		Collection< Gene> genes = geneOverlapDetector.getOverlaps(i);
		for (Gene g: genes) {
			List<GCResult> gcList = calculateGCContentGene(g, fastaRef, dict);
			if (this.OUTPUT_TRANSCRIPT_LEVEL!=null)
				writeResultTranscript(gcList, outTranscript);
			GCIsoformSummary summary = new GCIsoformSummary(g, gcList);
			if (this.OUTPUT_TRANSCRIPT_SEQUENCES!=null)
				writeTranscriptSequence(g, fastaRef, dict, outSequence);

			GCResult gc = calculateGCContentUnionExons(g, fastaRef, dict);

			writeResult(gc, summary, out);
		}
	}
	CloserUtil.close(refFileWalker);
	CloserUtil.close(out);
	if (this.OUTPUT_TRANSCRIPT_LEVEL!=null) CloserUtil.close(outTranscript);
	if (this.OUTPUT_TRANSCRIPT_SEQUENCES!=null) CloserUtil.close(outSequence);
       return 0;
}
 
Example 19
Source File: AlignmentsTagsTest.java    From cramtools with Apache License 2.0 4 votes vote down vote up
private void doTest(byte[] ref, int alignmentStart, byte[] readBases) throws IOException,
		CloneNotSupportedException {
	SAMSequenceRecord sequenceRecord = new SAMSequenceRecord("1", ref.length);
	SAMSequenceDictionary sequenceDictionary = new SAMSequenceDictionary();
	sequenceDictionary.addSequence(sequenceRecord);

	SAMFileHeader header = new SAMFileHeader();
	header.setSequenceDictionary(sequenceDictionary);
	SAMRecord samRecord = new SAMRecord(header);
	samRecord.setReadUnmappedFlag(false);
	samRecord.setAlignmentStart(alignmentStart);
	samRecord.setReferenceIndex(0);
	samRecord.setReadBases(readBases);
	samRecord.setCigarString(samRecord.getReadLength() + "M");

	ReferenceSource referenceSource = new ReferenceSource() {
		@Override
		public synchronized ReferenceRegion getRegion(SAMSequenceRecord record, int start_1based,
				int endInclusive_1based) throws IOException {
			int zbInclusiveStart = start_1based - 1;
			int zbExlcusiveEnd = endInclusive_1based;
			return new ReferenceRegion(Arrays.copyOfRange(ref, zbInclusiveStart, zbExlcusiveEnd),
					sequenceRecord.getSequenceIndex(), sequenceRecord.getSequenceName(), start_1based);
		}
	};

	AlignmentsTags.calculateMdAndNmTags(samRecord, referenceSource, sequenceDictionary, true, true);

	SAMRecord checkRecord = (SAMRecord) samRecord.clone();
	SequenceUtil.calculateMdAndNmTags(checkRecord, ref, true, true);
	// System.out.printf("TEST: ref %s, start %d, read bases %s\n", new
	// String(ref), alignmentStart, new String(
	// readBases));
	// System.out
	// .println(referenceSource.getRegion(sequenceRecord, alignmentStart,
	// alignmentStart + readBases.length));
	// System.out.printf("NM:  %s x %s\n", samRecord.getAttribute("NM"),
	// checkRecord.getAttribute("NM"));
	// System.out.printf("MD: %s x %s\n", samRecord.getAttribute("MD"),
	// checkRecord.getAttribute("MD"));

	Assert.assertEquals(checkRecord.getAttribute("NM"), samRecord.getAttribute("NM"));
	Assert.assertEquals(checkRecord.getAttribute("MD"), samRecord.getAttribute("MD"));
}
 
Example 20
Source File: Sharder.java    From dataflow-java with Apache License 2.0 4 votes vote down vote up
void createShardsForReference(SAMSequenceRecord reference, Contig contig) {
  LOG.info("Creating shard for: " + contig);
  final BitSet overlappingBins = GenomicIndexUtil.regionToBins(
      (int) contig.start, (int) contig.end);
  if (overlappingBins == null) {
    LOG.warning("No overlapping bins for " + contig.start + ":" + contig.end);
    return;
  }


  BAMShard currentShard = null;
  for (int binIndex = overlappingBins.nextSetBit(0); binIndex >= 0; binIndex = overlappingBins.nextSetBit(binIndex + 1)) {
    final Bin bin = index.getBinData(reference.getSequenceIndex(), binIndex);
    if (bin == null) {
      continue;
    }
    if (LOG.isLoggable(Level.FINE)) {
      LOG.fine("Processing bin " + index.getFirstLocusInBin(bin) + "-"
          + index.getLastLocusInBin(bin));
    }

    if (index.getLevelForBin(bin) != (GenomicIndexUtil.LEVEL_STARTS.length - 1)) {
      if (LOG.isLoggable(Level.FINEST)) {
        LOG.finest("Skipping - not lowest");
      }
      continue;
      // Skip non-lowest level bins
      // Its ok to skip higher level bins
      // because in BAMShard#finalize we
      // will get all chunks overlapping a genomic region that
      // we end up having for the shard, so we will not
      // miss any reads on the boundary.
    }
    final List<Chunk> chunksInBin = bin.getChunkList();
    if (chunksInBin.isEmpty()) {
      if (LOG.isLoggable(Level.FINEST)) {
        LOG.finest("Skipping - empty");
      }
      continue; // Skip empty bins
    }

    if (currentShard == null) {
      int startLocus =
          Math.max(index.getFirstLocusInBin(bin), (int)contig.start);

      if (LOG.isLoggable(Level.FINE)) {
        LOG.fine("Creating shard starting from " + startLocus);
      }
      currentShard = new BAMShard(filePath, reference.getSequenceName(),
          startLocus);
    }
    currentShard.addBin(chunksInBin, index.getLastLocusInBin(bin));
    if (LOG.isLoggable(Level.FINE)) {
      LOG.fine("Extending the shard  to " + index.getLastLocusInBin(bin));
    }

    if (shardingPolicy.apply(currentShard)) {
      LOG.info("Shard size is big enough to finalize: " +
          currentShard.sizeInLoci() + ", " + currentShard.approximateSizeInBytes() + " bytes");
      final BAMShard bamShard = currentShard.finalize(index, Math.min(index.getLastLocusInBin(bin), (int)contig.end));
      LOG.info("Outputting shard: " + bamShard.contig);
      output.output(bamShard);
      currentShard = null;
    }
  }
  if (currentShard != null) {
    LOG.info("Outputting last shard of size " +
        currentShard.sizeInLoci() + ", " + currentShard.approximateSizeInBytes() + " bytes "
        + currentShard.contig);
    output.output(currentShard.finalize(index, (int)contig.end));
  }
}