htsjdk.samtools.SAMSequenceRecord Java Examples

The following examples show how to use htsjdk.samtools.SAMSequenceRecord. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: CreateSequenceDictionary.java    From picard with MIT License 6 votes vote down vote up
/**
 * Read all the sequences from the given reference file, and convert into SAMSequenceRecords
 *
 * @param referenceFile fasta or fasta.gz
 * @return SAMSequenceRecords containing info from the fasta, plus from cmd-line arguments.
 */
public SAMSequenceDictionary makeSequenceDictionary(final File referenceFile) {
    final Iterable<SAMSequenceRecord> samSequenceRecordsIterable = getSamSequenceRecordsIterable();

    final List<SAMSequenceRecord> ret = new ArrayList<>();
    final Set<String> sequenceNames = new HashSet<>();
    for (SAMSequenceRecord rec : samSequenceRecordsIterable) {

        if (sequenceNames.contains(rec.getSequenceName())) {
            throw new PicardException("Sequence name appears more than once in reference: " + rec.getSequenceName());
        }
        sequenceNames.add(rec.getSequenceName());
        ret.add(rec);
    }
    return new SAMSequenceDictionary(ret);
}
 
Example #2
Source File: MaskReferenceSequence.java    From Drop-seq with MIT License 6 votes vote down vote up
private void processByPartialContig (final ReferenceSequenceFile ref, final FastaSequenceFileWriter writer, final File intervalListFile) {
	SAMSequenceDictionary sd = ref.getSequenceDictionary();
	// validate that the intervals and the reference have the same sequence dictionary.
	IntervalList iList = IntervalList.fromFile(intervalListFile);
	iList.getHeader().getSequenceDictionary().assertSameDictionary(sd);
	// map the intervals to a map to each contig.
	Map<String, List<Interval>> intervalsPerContig = getIntervalsForContig(iList);

	for (SAMSequenceRecord r: sd.getSequences()) {
		String contig = r.getSequenceName();
		log.info("Processing partial contig " + contig);
		// this list can be null.
		List<Interval> intervalsToMask = intervalsPerContig.get(contig);
		ReferenceSequence rs = ref.getSequence(contig);
		writeSequence(rs, intervalsToMask, writer);
	}
}
 
Example #3
Source File: Merge.java    From cramtools with Apache License 2.0 6 votes vote down vote up
private static SAMFileHeader mergeHeaders(List<RecordSource> sources) {
	SAMFileHeader header = new SAMFileHeader();
	for (RecordSource source : sources) {
		SAMFileHeader h = source.reader.getFileHeader();

		for (SAMSequenceRecord seq : h.getSequenceDictionary().getSequences()) {
			if (header.getSequenceDictionary().getSequence(seq.getSequenceName()) == null)
				header.addSequence(seq);
		}

		for (SAMProgramRecord pro : h.getProgramRecords()) {
			if (header.getProgramRecord(pro.getProgramGroupId()) == null)
				header.addProgramRecord(pro);
		}

		for (String comment : h.getComments())
			header.addComment(comment);

		for (SAMReadGroupRecord rg : h.getReadGroups()) {
			if (header.getReadGroup(rg.getReadGroupId()) == null)
				header.addReadGroup(rg);
		}
	}

	return header;
}
 
Example #4
Source File: SequenceDictionaryUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Returns a compact String representation of the sequence dictionary it's passed
 *
 * The format of the returned String is:
 * [ contig1Name(length: contig1Length) contig2Name(length: contig2Length) ... ]
 *
 * @param dict a non-null SAMSequenceDictionary
 * @return A String containing all of the contig names and lengths from the sequence dictionary it's passed
 */
public static String getDictionaryAsString( final SAMSequenceDictionary dict ) {
    Utils.nonNull(dict, "Sequence dictionary must be non-null");

    StringBuilder s = new StringBuilder("[ ");

    for ( SAMSequenceRecord dictionaryEntry : dict.getSequences() ) {
        s.append(dictionaryEntry.getSequenceName());
        s.append("(length:");
        s.append(dictionaryEntry.getSequenceLength());
        s.append(") ");
    }

    s.append("]");

    return s.toString();
}
 
Example #5
Source File: IntervalUtilsUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
@Test(expectedExceptions={UserException.CouldNotReadInputFile.class, UserException.MalformedGenomeLoc.class, UserException.MalformedFile.class}, dataProvider="invalidIntervalTestData")
public void testIntervalFileToListInvalidPicardIntervalHandling(GenomeLocParser genomeLocParser,
                                   String contig, int intervalStart, int intervalEnd ) throws Exception {

    final SAMFileHeader picardFileHeader = new SAMFileHeader();
    picardFileHeader.addSequence(genomeLocParser.getContigInfo("1"));
    picardFileHeader.addSequence(new SAMSequenceRecord("2", 100000));

    final IntervalList picardIntervals = new IntervalList(picardFileHeader);
    picardIntervals.add(new Interval(contig, intervalStart, intervalEnd, true, "dummyname"));

    final File picardIntervalFile = createTempFile("testIntervalFileToListInvalidPicardIntervalHandling", ".interval_list");
    picardIntervals.write(picardIntervalFile);

    // loadIntervals() will validate all intervals against the sequence dictionary in our genomeLocParser,
    // and should throw for all intervals in our invalidIntervalTestData set
    IntervalUtils.parseIntervalArguments(genomeLocParser, picardIntervalFile.getAbsolutePath());
}
 
Example #6
Source File: CreateSequenceDictionary.java    From varsim with BSD 2-Clause "Simplified" License 6 votes vote down vote up
/**
 * Create one SAMSequenceRecord from a single fasta sequence
 */
private SAMSequenceRecord makeSequenceRecord(final ReferenceSequence refSeq) {
  final SAMSequenceRecord ret = new SAMSequenceRecord(refSeq.getName(), refSeq.length());

  // Compute MD5 of upcased bases
  final byte[] bases = refSeq.getBases();
  for (int i = 0; i < bases.length; ++i) {
    bases[i] = StringUtil.toUpperCase(bases[i]);
  }

  ret.setAttribute(SAMSequenceRecord.MD5_TAG, md5Hash(bases));
  if (GENOME_ASSEMBLY != null) {
    ret.setAttribute(SAMSequenceRecord.ASSEMBLY_TAG, GENOME_ASSEMBLY);
  }
  ret.setAttribute(SAMSequenceRecord.URI_TAG, URI);
  if (SPECIES != null) {
    ret.setAttribute(SAMSequenceRecord.SPECIES_TAG, SPECIES);
  }
  return ret;
}
 
Example #7
Source File: ChromosomeLengthFactory.java    From hmftools with GNU General Public License v3.0 6 votes vote down vote up
@NotNull
public static List<ChromosomeLength> create(@NotNull final SAMSequenceDictionary dictionary) {
    final List<ChromosomeLength> results = Lists.newArrayList();

    for (final SAMSequenceRecord samSequenceRecord : dictionary.getSequences()) {
        final String sequenceName = samSequenceRecord.getSequenceName();
        if (HumanChromosome.contains(sequenceName)) {
            results.add(ImmutableChromosomeLength.builder()
                    .chromosome(sequenceName)
                    .length(samSequenceRecord.getSequenceLength())
                    .build());
        }
    }

    return results;
}
 
Example #8
Source File: SequenceDictionaryIntersectionTest.java    From Drop-seq with MIT License 6 votes vote down vote up
@Test
public void testNoMatch() {
    final File leftDictFile = new File(TESTDATA_DIR, CHR_BASENAME + "sam");
    final SamReader leftDict = SamReaderFactory.makeDefault().open(leftDictFile);
    try {
        final SAMSequenceDictionary nonMatchingDict = new SAMSequenceDictionary(
                leftDict.getFileHeader().getSequenceDictionary().getSequences().stream().
                        map((s) -> new SAMSequenceRecord("xyz_" + s.getSequenceName(), s.getSequenceLength())).
                        collect(Collectors.toList()));
        SequenceDictionaryIntersection sdi = new SequenceDictionaryIntersection(
                leftDict, leftDictFile.getName(), nonMatchingDict, "with prefixes");
        Assert.assertEquals(sdi.getIntersection().size(), 0);
        System.out.println("Terse with descriptions: " + sdi.message(false));
        System.out.println("Verbose with descriptions: " + sdi.message(true));

        sdi = new SequenceDictionaryIntersection(leftDict, nonMatchingDict);
        Assert.assertEquals(sdi.getIntersection().size(), 0);
        System.out.println("Terse without descriptions: " + sdi.message(false) + "\n");
        System.out.println("Verbose without descriptions: " + sdi.message(true) + "\n");
    } finally {
        CloserUtil.close(leftDict);
    }
}
 
Example #9
Source File: SequenceDictionaryUtilsUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
@DataProvider( name = "testSequenceRecordsAreEquivalentDataProvider" )
public Object[][] testSequenceRecordsAreEquivalentDataProvider() {
    final SAMSequenceRecord CHRM_HG19 = new SAMSequenceRecord("chrM", 16571);
    final SAMSequenceRecord CHR_NONSTANDARD1 = new SAMSequenceRecord("NonStandard1", 8675309);
    final SAMSequenceRecord CHR1_HG19_WITH_UNKNOWN_LENGTH = new SAMSequenceRecord(CHR1_HG19.getSequenceName(), SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH);
    final SAMSequenceRecord CHR1_HG19_WITH_DIFFERENT_LENGTH = new SAMSequenceRecord(CHR1_HG19.getSequenceName(), 123456);
    return new Object[][]{
            {CHR1_HG19, CHR1_HG19, true},
            {CHR1_HG19, CHRM_HG19, false},
            {CHR1_HG19, CHR_NONSTANDARD1, false},
            {null, null, true},
            {CHR1_HG19, null, false},
            {null, CHR1_HG19, false},
            {CHR1_HG19, CHR1_HG19_WITH_UNKNOWN_LENGTH, true},
            {CHR1_HG19, CHR1_HG19_WITH_DIFFERENT_LENGTH, false},
            {CHR1_HG19_WITH_UNKNOWN_LENGTH, CHR1_HG19, true},
            {CHR1_HG19_WITH_DIFFERENT_LENGTH, CHR1_HG19, false},
    };
}
 
Example #10
Source File: ShardUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
@DataProvider(name = "DivideIntervalIntoShardsInvalidTestData")
public Object[][] divideIntervalIntoShardsInvalidTestData() {
    final SAMSequenceDictionary dictionary = new SAMSequenceDictionary(Arrays.asList(new SAMSequenceRecord("1", 16000)));

    return new Object[][] {
            // contig not in dictionary
            { new SimpleInterval("2", 1, 10), 1, 1, 0, dictionary },
            // interval not within contig bounds
            { new SimpleInterval("1", 1, 17000), 1, 1, 0, dictionary },
            // shardSize == 0
            { new SimpleInterval("1", 1, 10), 0, 1, 0, dictionary },
            // shardSize < 0
            { new SimpleInterval("1", 1, 10), -1, 1, 0, dictionary },
            // shardStep == 0
            { new SimpleInterval("1", 1, 10), 1, 0, 0, dictionary },
            // shardStep < 0
            { new SimpleInterval("1", 1, 10), 1, -1, 0, dictionary },
            // shardPadding < 0
            { new SimpleInterval("1", 1, 10), 1, 1, -1, dictionary },
            // null interval
            { null, 1, 1, 0, dictionary },
            // null dictionary
            { new SimpleInterval("1", 1, 10), 1, 1, 0, null }
    };
}
 
Example #11
Source File: PicardIndexedFastaSequenceFile.java    From chipster with MIT License 6 votes vote down vote up
/**
 * Do some basic checking to make sure the dictionary and the index match.
 * @param fastaFile Used for error reporting only.
 * @param sequenceDictionary sequence dictionary to check against the index.
 * @param index index file to check against the dictionary.
 */
protected static void sanityCheckDictionaryAgainstIndex(final String fastaFile,
                                                        final SAMSequenceDictionary sequenceDictionary,
                                                        final FastaSequenceIndex index) {
    // Make sure dictionary and index are the same size.
    if( sequenceDictionary.getSequences().size() != index.size() )
        throw new SAMException("Sequence dictionary and index contain different numbers of contigs");

    Iterator<SAMSequenceRecord> sequenceIterator = sequenceDictionary.getSequences().iterator();
    Iterator<FastaSequenceIndexEntry> indexIterator = index.iterator();

    while(sequenceIterator.hasNext() && indexIterator.hasNext()) {
        SAMSequenceRecord sequenceEntry = sequenceIterator.next();
        FastaSequenceIndexEntry indexEntry = indexIterator.next();

        if(!sequenceEntry.getSequenceName().equals(indexEntry.getContig())) {
            throw new SAMException(String.format("Mismatch between sequence dictionary fasta index for %s, sequence '%s' != '%s'.",
                    fastaFile, sequenceEntry.getSequenceName(),indexEntry.getContig()));
        }

        // Make sure sequence length matches index length.
        if( sequenceEntry.getSequenceLength() != indexEntry.getSize())
            throw new SAMException("Index length does not match dictionary length for contig: " + sequenceEntry.getSequenceName() );
    }
}
 
Example #12
Source File: SequenceDictionaryUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Helper routine that returns whether two sequence records are equivalent, defined as having the same name and
 * lengths.
 *
 * NOTE: we allow the lengths to differ if one or both are UNKNOWN_SEQUENCE_LENGTH
 *
 * @param first first sequence record to compare
 * @param second second sequence record to compare
 * @return true if first and second have the same names and lengths, otherwise false
 */
public static boolean sequenceRecordsAreEquivalent(final SAMSequenceRecord first, final SAMSequenceRecord second) {
    if ( first == second ) {
        return true;
    }
    if ( first == null || second == null ) {
        return false;
    }
    final int length1 = first.getSequenceLength();
    final int length2 = second.getSequenceLength();

    if (length1 != length2 && length1 != SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH && length2 != SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH){
        return false;
    }
    if (! first.getSequenceName().equals(second.getSequenceName())){
        return false;
    }
    return true;
}
 
Example #13
Source File: CollectSamErrorMetricsTest.java    From picard with MIT License 6 votes vote down vote up
@Test(dataProvider = "provideForTestHasDeletionBeenProcessed")
public void testHasDeletionBeenProcessed(final String cigarString, final List<Boolean> expected) {
    final Cigar cigar = TextCigarCodec.decode(cigarString);
    final SAMRecord deletionRecord = createRecordFromCigar(cigar.toString());

    final CollectSamErrorMetrics collectSamErrorMetrics = new CollectSamErrorMetrics();
    final SAMSequenceRecord samSequenceRecord = new SAMSequenceRecord("chr1", 2000);

    int position = 100;
    int iExpected = 0;

    for(CigarElement cigarElement : cigar.getCigarElements()) {
        for(int iCigarElementPosition = 0; iCigarElementPosition < cigarElement.getLength(); iCigarElementPosition++) {
            final SamLocusIterator.LocusInfo locusInfo = new SamLocusIterator.LocusInfo(samSequenceRecord, ++position);
            if(cigarElement.getOperator() == CigarOperator.D) {
                Assert.assertEquals(collectSamErrorMetrics.processDeletionLocus(new SamLocusIterator.RecordAndOffset(deletionRecord, 0), locusInfo), (boolean) expected.get(iExpected++));
            }
        }
    }
}
 
Example #14
Source File: GencodeFuncotationFactory.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Creates a map of Transcript IDs for use in looking up transcripts from the FASTA dictionary for the GENCODE Transcripts.
 * We include the start and stop codons in the transcripts so we can handle start/stop codon variants.
 * @param fastaReference The {@link ReferenceDataSource} corresponding to the Transcript FASTA file for this GENCODE dataset.
 * @return A {@link Map} of {@link String} -> {@link MappedTranscriptIdInfo} which maps real transcript IDs to the information about that transcript in the transcript FASTA file.
 */
@VisibleForTesting
static Map<String, MappedTranscriptIdInfo> createTranscriptIdMap(final ReferenceDataSource fastaReference) {

    final Map<String, MappedTranscriptIdInfo> idMap = new HashMap<>();

    for ( final SAMSequenceRecord sequence : fastaReference.getSequenceDictionary().getSequences() ) {

        final MappedTranscriptIdInfo transcriptInfo = createMappedTranscriptIdInfo( sequence );

        // The names in the file are actually in a list with | between each sequence name.
        // We need to split the names and add them to the dictionary so we can resolve them to the full
        // sequence name as it appears in the file:
        for ( final String transcriptId : Utils.split(sequence.getSequenceName(), "|") ) {
            idMap.put(transcriptId, transcriptInfo);
        }
    }

    return idMap;
}
 
Example #15
Source File: ReferenceSource.java    From cramtools with Apache License 2.0 6 votes vote down vote up
@Override
public synchronized byte[] getReferenceBases(SAMSequenceRecord record, boolean tryNameVariants) {
	byte[] bases = findBases(record, tryNameVariants);
	if (bases == null)
		return null;

	cacheW.put(record.getSequenceName(), new WeakReference<byte[]>(bases));

	String md5 = record.getAttribute(SAMSequenceRecord.MD5_TAG);
	if (md5 == null) {
		md5 = Utils.calculateMD5String(bases);
		record.setAttribute(SAMSequenceRecord.MD5_TAG, md5);
	}

	if (REF_CACHE != null)
		addToRefCache(md5, bases);

	return bases;
}
 
Example #16
Source File: SamRangeUtils.java    From rtg-tools with BSD 2-Clause "Simplified" License 6 votes vote down vote up
/**
 * Resolves an inital range (supplied by the user, and may have unbounded ends) to the available sequences.
 * If end is greater than number of sequences it sets end to number of sequences.
 * @param range the range
 * @param dictionary the dictionary with which to validate/resolve the range
 * @return the resolved range.
 * @throws NoTalkbackSlimException if the start is out of range.
 */
public static SequenceNameLocus resolveRestriction(SAMSequenceDictionary dictionary, SequenceNameLocus range) {
  final SAMSequenceRecord sequence = dictionary.getSequence(range.getSequenceName());
  if (sequence == null) {
    throw new NoTalkbackSlimException("Sequence \"" + range.getSequenceName() + "\" referenced in region was not found in the SAM sequence dictionary.");
  }
  final int start = range.getStart() == SamRegionRestriction.MISSING ? 0 : range.getStart();
  final int length = sequence.getSequenceLength();
  if (start > length || (length != 0 && start == length)) {  // Allow start == 0 if empty sequence
    throw new NoTalkbackSlimException("The start position \"" + start + "\" must be less than than length of the sequence \"" + length + "\".");
  }
  int end = range.getEnd() == LongRange.MISSING ? length : range.getEnd();
  if (end > length) {
    Diagnostic.warning("The end position \"" + range.getEnd() + "\" is outside the length of the sequence (" + length
      + "). Defaulting end to \"" + length + "\"");
    end = length;
  }
  return new SequenceNameLocusSimple(range.getSequenceName(), start, end);
}
 
Example #17
Source File: SimpleIntervalCollectionUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
@DataProvider(name = "simpleIntervalCollectionSortedOrderBadTestData")
public Object[][] getSimpleIntervalCollectionSortedOrderBadTestData() {
    final List<SAMSequenceRecord> extendedSAMSequenceRecords = new ArrayList<>(
            collection_0.getMetadata().getSequenceDictionary().getSequences());
    extendedSAMSequenceRecords.add(new SAMSequenceRecord("a_very_special_contig", 10));
    final SAMSequenceDictionary extendedSAMSequenceDictionary = new SAMSequenceDictionary(extendedSAMSequenceRecords);
    final SimpleIntervalCollection collection_0_bad = new SimpleIntervalCollection(
            new SimpleLocatableMetadata(extendedSAMSequenceDictionary),
            collection_0.getRecords());
    return new Object[][] { /* repeated collections */
            {Arrays.asList(collection_0, collection_0, collection_2)},
            {Arrays.asList(collection_2, collection_2, collection_2)},
            {Arrays.asList(collection_2, collection_1, collection_1)},
            {Arrays.asList(collection_0_bad, collection_1, collection_2)} /* collect_0_bad has different metadata */
    };
}
 
Example #18
Source File: PSFilterTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
@Test(groups = "spark")
public void testDoSetPairFlags() {

    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final SAMSequenceDictionary seq = new SAMSequenceDictionary();
    seq.addSequence(new SAMSequenceRecord("test_seq", 1000));
    final SAMFileHeader header = new SAMFileHeader(seq);

    final List<GATKRead> readList = makeReadSet(header);
    final JavaRDD<GATKRead> reads = ctx.parallelize(readList);
    ;

    final List<GATKRead> result = PSFilter.setPairFlags(reads, 100).collect();

    Assert.assertEquals(result.size(), 6);
    for (final GATKRead read : result) {
        if (read.getName().equals("paired_1") || read.getName().equals("paired_2")) {
            Assert.assertTrue(read.isPaired());
        } else {
            Assert.assertFalse(read.isPaired());
        }
    }

}
 
Example #19
Source File: SmartSamWriterTest.java    From rtg-tools with BSD 2-Clause "Simplified" License 6 votes vote down vote up
private static SAMRecord createRecord(int position, String ref) {
  final SAMFileHeader header = new SAMFileHeader();
  header.getSequenceDictionary().addSequence(new SAMSequenceRecord("A", 100000));
  header.getSequenceDictionary().addSequence(new SAMSequenceRecord("B", 100000));
  final SAMRecord rec = new SAMRecord(header);
  rec.setReferenceName(ref);
  rec.setReadName("read");
  rec.setReadString("ACGT");
  rec.setBaseQualityString("####");
  if (ref.equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) {
    rec.setReadUnmappedFlag(true);
  } else {
    rec.setCigarString("4=");
    rec.setAlignmentStart(position);
  }
  return rec;
}
 
Example #20
Source File: AnnotateTargetsIntegrationTest.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
private List<SimpleInterval> createRandomIntervals(final SAMSequenceDictionary referenceDictionary, final int numberOfIntervals, final int minIntervalSize, final int maxIntervalSize, final int meanIntervalSize, final double intervalSizeStdev) {
    final List<SimpleInterval> result = new ArrayList<>(numberOfIntervals);
    final int numberOfSequences = referenceDictionary.getSequences().size();
    for (int i = 0; i < numberOfIntervals; i++) {
        final SAMSequenceRecord contig = referenceDictionary.getSequence(RANDOM.nextInt(numberOfSequences));
        final String contigName = contig.getSequenceName();
        final int intervalSize = Math.min(maxIntervalSize, (int) Math.max(minIntervalSize, Math.round(RANDOM.nextDouble() * intervalSizeStdev + meanIntervalSize)));
        final int intervalStart = 1 + RANDOM.nextInt(contig.getSequenceLength() - intervalSize);
        final int intervalEnd = intervalStart + intervalSize - 1;
        final SimpleInterval interval = new SimpleInterval(contigName, intervalStart, intervalEnd);
        result.add(interval);
    }

    final Comparator<SimpleInterval> comparator =
            Comparator.comparing(SimpleInterval::getContig,
                    (a, b) -> Integer.compare(
                            referenceDictionary.getSequenceIndex(a),
                            referenceDictionary.getSequenceIndex(b)))
            .thenComparingInt(SimpleInterval::getStart)
            .thenComparingInt(SimpleInterval::getEnd);
    Collections.sort(result, comparator);
    return result;
}
 
Example #21
Source File: SamRegionRestrictionTest.java    From rtg-tools with BSD 2-Clause "Simplified" License 6 votes vote down vote up
public void testSomeMethod() {
  SamRegionRestriction srr = new SamRegionRestriction("fooo");
  final SAMSequenceDictionary sdd = new SAMSequenceDictionary();
  sdd.addSequence(new SAMSequenceRecord("fooo", 9876));
  assertEquals(0, srr.resolveStart());
  assertEquals(9876, srr.resolveEnd(sdd));
  assertEquals("fooo", srr.getSequenceName());
  assertEquals(RegionRestriction.MISSING, srr.getStart());
  assertEquals(RegionRestriction.MISSING, srr.getEnd());
  assertEquals("fooo", srr.toString());

  srr = new SamRegionRestriction("fooo", 75, 555);
  assertEquals(75, srr.resolveStart());
  assertEquals(555, srr.resolveEnd(sdd));
  assertEquals("fooo", srr.getSequenceName());
  assertEquals(75, srr.getStart());
  assertEquals(555, srr.getEnd());
  assertEquals("fooo:76-555", srr.toString());
  srr = new SamRegionRestriction("fooo", 75, RegionRestriction.MISSING);
  assertEquals("fooo:76", srr.toString());
}
 
Example #22
Source File: FindBreakpointEvidenceSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
static SVIntervalTree<SVInterval> findGenomewideHighCoverageIntervalsToIgnore(final FindBreakpointEvidenceSparkArgumentCollection params,
                                                                              final ReadMetadata readMetadata,
                                                                              final JavaSparkContext ctx,
                                                                              final SAMFileHeader header,
                                                                              final JavaRDD<GATKRead> unfilteredReads,
                                                                              final SVReadFilter filter,
                                                                              final Logger logger,
                                                                              final Broadcast<ReadMetadata> broadcastMetadata) {
    final int capacity = header.getSequenceDictionary().getSequences().stream()
            .mapToInt(seqRec -> (seqRec.getSequenceLength() + DEPTH_WINDOW_SIZE - 1)/DEPTH_WINDOW_SIZE).sum();
    final List<SVInterval> depthIntervals = new ArrayList<>(capacity);
    for (final SAMSequenceRecord sequenceRecord : header.getSequenceDictionary().getSequences()) {
        final int contigID = readMetadata.getContigID(sequenceRecord.getSequenceName());
        final int contigLength = sequenceRecord.getSequenceLength();
        for (int i = 1; i < contigLength; i = i + DEPTH_WINDOW_SIZE) {
            depthIntervals.add(new SVInterval(contigID, i, Math.min(contigLength, i + DEPTH_WINDOW_SIZE)));
        }
    }

    final List<SVInterval> highCoverageSubintervals = findHighCoverageSubintervalsAndLog(
            params, ctx, broadcastMetadata, depthIntervals, unfilteredReads, filter, logger);
    final SVIntervalTree<SVInterval> highCoverageSubintervalTree = new SVIntervalTree<>();
    highCoverageSubintervals.forEach(i -> highCoverageSubintervalTree.put(i, i));

    return highCoverageSubintervalTree;
}
 
Example #23
Source File: RandomDNA.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Creates a random reference and writes it in FASTA format into a {@link Writer}.
 * @param out the output writer.
 * @param dict the dictionary indicating the number of contigs and their lengths.
 * @param basesPerLine number of base to print in each line of the output FASTA file.
 *
 * @throws IOException if such an exception was thrown while accessing and writing into the temporal file.
 * @throws IllegalArgumentException if {@code dict} is {@code null}, or {@code out } is {@code null}
 *    or {@code basesPerLine} is 0 or negative.
 */
public void nextFasta(final Writer out, final SAMSequenceDictionary dict, final int basesPerLine)
        throws IOException {
    Utils.nonNull(out);
    Utils.nonNull(dict);
    ParamUtils.isPositive(basesPerLine, "number of base per line must be strictly positive: " + basesPerLine);
    final byte[] buffer = new byte[basesPerLine];
    final String lineSeparator = System.lineSeparator();
    for (final SAMSequenceRecord sequence : dict.getSequences()) {
        int pendingBases = sequence.getSequenceLength();
        out.append(">").append(sequence.getSequenceName()).append(lineSeparator);
        while (pendingBases > 0) {
            final int lineLength = pendingBases < basesPerLine ? pendingBases : basesPerLine;
            nextBases(buffer, 0, lineLength);
            out.append(new String(buffer, 0, lineLength)).append(lineSeparator);
            pendingBases -= lineLength;
        }
    }
}
 
Example #24
Source File: SequenceDictionaryUtils.java    From picard with MIT License 5 votes vote down vote up
@Override
public SAMSequenceRecord next() {
    if (!hasNext()) {
        throw new NoSuchElementException("next() was called when hasNext() was false.");
    }
    final SAMSequenceRecord samSequenceRecord = makeSequenceRecord(nextRefSeq);
    nextRefSeq = refSeqFile.nextSequence();
    return samSequenceRecord;
}
 
Example #25
Source File: IntervalUtilsUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@DataProvider(name = "IntervalIsOnDictionaryContigData")
public Object[][] intervalIsOnDictionaryContigData() {
    final SAMSequenceDictionary dictionary = new SAMSequenceDictionary(Arrays.asList(new SAMSequenceRecord("1", 1000)));

    return new Object[][] {
            { new SimpleInterval("1", 1, 10), dictionary, true },
            { new SimpleInterval("1", 50, 100), dictionary, true },
            { new SimpleInterval("1", 1, 1000), dictionary, true },
            { new SimpleInterval("2", 1, 10), dictionary, false },
            { new SimpleInterval("1", 1, 1001), dictionary, false },
            { new SimpleInterval("1", 1, 2000), dictionary, false }
    };
}
 
Example #26
Source File: IntervalUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Determines whether the provided interval is within the bounds of its assigned contig according to the provided dictionary
 *
 * @param interval interval to check
 * @param dictionary dictionary to use to validate contig bounds
 * @return true if the interval's contig exists in the dictionary, and the interval is within its bounds, otherwise false
 */
public static boolean intervalIsOnDictionaryContig( final SimpleInterval interval, final SAMSequenceDictionary dictionary ) {
    Utils.nonNull(interval);
    Utils.nonNull(dictionary);

    final SAMSequenceRecord contigRecord = dictionary.getSequence(interval.getContig());
    if ( contigRecord == null ) {
        return false;
    }

    return interval.getEnd() <= contigRecord.getSequenceLength();
}
 
Example #27
Source File: CachingIndexedFastaSequenceFileUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Test(dataProvider = "fastas")
public void testCachingIndexedFastaReaderTwoStage(Path fasta, Path unzipped, int cacheSize, int querySize) throws IOException {
    try(final ReferenceSequenceFile uncached = ReferenceSequenceFileFactory.getReferenceSequenceFile(unzipped);
        final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true, false)) {

        SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0);

        int middleStart = (contig.getSequenceLength() - querySize) / 2;
        int middleStop = middleStart + querySize;

        logger.debug(String.format(
                "Checking contig %s length %d with cache size %d and query size %d with intermediate query",
                contig.getSequenceName(), contig.getSequenceLength(), cacheSize, querySize));

        for (int i = 0; i < contig.getSequenceLength(); i += 10) {
            int start = i;
            int stop = start + querySize;
            if (stop <= contig.getSequenceLength()) {
                ReferenceSequence grabMiddle = caching.getSubsequenceAt(contig.getSequenceName(), middleStart,
                                                                        middleStop);
                ReferenceSequence cachedVal = caching.getSubsequenceAt(contig.getSequenceName(), start, stop);
                ReferenceSequence uncachedVal = uncached.getSubsequenceAt(contig.getSequenceName(), start, stop);

                Assert.assertEquals(cachedVal.getName(), uncachedVal.getName());
                Assert.assertEquals(cachedVal.getContigIndex(), uncachedVal.getContigIndex());
                Assert.assertEquals(cachedVal.getBases(), uncachedVal.getBases());
            }
        }
    }
}
 
Example #28
Source File: UpdateVCFSequenceDictionary.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Override
public void apply(final VariantContext vc, final ReadsContext readsContext, final ReferenceContext ref, final FeatureContext featureContext) {
    // Validate each variant against the source dictionary manually
    SAMSequenceRecord samSeqRec = sourceDictionary.getSequence(vc.getContig());
    if (samSeqRec == null) {
        throw new CommandLineException.BadArgumentValue(
            String.format(
                "The input variant file contains a variant (ID: \"%s\") with a reference to a sequence (\"%s\") " +
                "that is not present in the provided dictionary",
                vc.getID(),
                vc.getContig()
            )
        );
    } else if (vc.getEnd() > samSeqRec.getSequenceLength()) {
        throw new CommandLineException.BadArgumentValue(
            String.format(
                "The input variant file contains a variant (ID: \"%s\") with a reference to a sequence (\"%s\") " +
                "that ends at a position (%d) that exceeds the length of that sequence (%d) in the provided dictionary",
                vc.getID(),
                vc.getContig(),
                vc.getEnd(),
                samSeqRec.getSequenceLength()
            )
        );
    }
    vcfWriter.add(vc);
}
 
Example #29
Source File: ValidateReference.java    From Drop-seq with MIT License 5 votes vote down vote up
private SAMSequenceDictionary makeSequenceDictionary(final File referenceFile) {
    final ReferenceSequenceFile refSeqFile =
            ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceFile, true);
    ReferenceSequence refSeq;
    final List<SAMSequenceRecord> ret = new ArrayList<>();
    final Set<String> sequenceNames = new HashSet<>();
    while ((refSeq = refSeqFile.nextSequence()) != null) {
        if (sequenceNames.contains(refSeq.getName())) {
            throw new PicardException("Sequence name appears more than once in reference: " + refSeq.getName());
        }
        sequenceNames.add(refSeq.getName());
        ret.add(new SAMSequenceRecord(refSeq.getName(), refSeq.length()));
    }
    return new SAMSequenceDictionary(ret);
}
 
Example #30
Source File: PSBuildReferenceTaxonomyUtilsTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Test
public void testParseReferenceRecords() {

    final Map<Integer, PSPathogenReferenceTaxonProperties> taxIdToProperties = new HashMap<>();

    final List<SAMSequenceRecord> dictList = new ArrayList<>();

    dictList.add(new SAMSequenceRecord("record1|test", 500));
    dictList.add(new SAMSequenceRecord("record2|taxid|1", 5000));
    dictList.add(new SAMSequenceRecord("record3|taxid|2|", 1000));
    dictList.add(new SAMSequenceRecord("record4|ref|NC_1", 2000));
    dictList.add(new SAMSequenceRecord("record5|ref|NC_2|taxid|1", 1000));

    Map<String, Tuple2<String, Long>> result = PSBuildReferenceTaxonomyUtils.parseReferenceRecords(dictList, taxIdToProperties);

    Assert.assertNotNull(result);
    Assert.assertEquals(result.size(), 2);
    Assert.assertTrue(result.containsKey("record1"));
    Assert.assertEquals(result.get("record1")._1, "record1|test");
    Assert.assertEquals((long) result.get("record1")._2, 500);
    Assert.assertTrue(result.containsKey("NC_1"));
    Assert.assertEquals(result.get("NC_1")._1, "record4|ref|NC_1");
    Assert.assertEquals((long) result.get("NC_1")._2, 2000);

    Assert.assertEquals(taxIdToProperties.size(), 2);
    Assert.assertTrue(taxIdToProperties.containsKey(1));
    Assert.assertEquals(taxIdToProperties.get(1).getAccessions().size(), 2);
    Assert.assertTrue(taxIdToProperties.get(1).getAccessions().contains("record2|taxid|1"));
    Assert.assertTrue(taxIdToProperties.get(1).getAccessions().contains("record5|ref|NC_2|taxid|1"));
    Assert.assertEquals(taxIdToProperties.get(1).getTotalLength(), 6000);
    Assert.assertTrue(taxIdToProperties.containsKey(2));
    Assert.assertEquals(taxIdToProperties.get(2).getAccessions().size(), 1);
    Assert.assertTrue(taxIdToProperties.get(2).getAccessions().contains("record3|taxid|2|"));
    Assert.assertEquals(taxIdToProperties.get(2).getTotalLength(), 1000);
}