Java Code Examples for htsjdk.samtools.SAMSequenceRecord

The following examples show how to use htsjdk.samtools.SAMSequenceRecord. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: gatk   Source File: RandomDNA.java    License: BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Creates a random reference and writes it in FASTA format into a {@link Writer}.
 * @param out the output writer.
 * @param dict the dictionary indicating the number of contigs and their lengths.
 * @param basesPerLine number of base to print in each line of the output FASTA file.
 *
 * @throws IOException if such an exception was thrown while accessing and writing into the temporal file.
 * @throws IllegalArgumentException if {@code dict} is {@code null}, or {@code out } is {@code null}
 *    or {@code basesPerLine} is 0 or negative.
 */
public void nextFasta(final Writer out, final SAMSequenceDictionary dict, final int basesPerLine)
        throws IOException {
    Utils.nonNull(out);
    Utils.nonNull(dict);
    ParamUtils.isPositive(basesPerLine, "number of base per line must be strictly positive: " + basesPerLine);
    final byte[] buffer = new byte[basesPerLine];
    final String lineSeparator = System.lineSeparator();
    for (final SAMSequenceRecord sequence : dict.getSequences()) {
        int pendingBases = sequence.getSequenceLength();
        out.append(">").append(sequence.getSequenceName()).append(lineSeparator);
        while (pendingBases > 0) {
            final int lineLength = pendingBases < basesPerLine ? pendingBases : basesPerLine;
            nextBases(buffer, 0, lineLength);
            out.append(new String(buffer, 0, lineLength)).append(lineSeparator);
            pendingBases -= lineLength;
        }
    }
}
 
Example 2
Source Project: Drop-seq   Source File: MaskReferenceSequence.java    License: MIT License 6 votes vote down vote up
private void processByPartialContig (final ReferenceSequenceFile ref, final FastaSequenceFileWriter writer, final File intervalListFile) {
	SAMSequenceDictionary sd = ref.getSequenceDictionary();
	// validate that the intervals and the reference have the same sequence dictionary.
	IntervalList iList = IntervalList.fromFile(intervalListFile);
	iList.getHeader().getSequenceDictionary().assertSameDictionary(sd);
	// map the intervals to a map to each contig.
	Map<String, List<Interval>> intervalsPerContig = getIntervalsForContig(iList);

	for (SAMSequenceRecord r: sd.getSequences()) {
		String contig = r.getSequenceName();
		log.info("Processing partial contig " + contig);
		// this list can be null.
		List<Interval> intervalsToMask = intervalsPerContig.get(contig);
		ReferenceSequence rs = ref.getSequence(contig);
		writeSequence(rs, intervalsToMask, writer);
	}
}
 
Example 3
Source Project: Drop-seq   Source File: SequenceDictionaryIntersectionTest.java    License: MIT License 6 votes vote down vote up
@Test
public void testNoMatch() {
    final File leftDictFile = new File(TESTDATA_DIR, CHR_BASENAME + "sam");
    final SamReader leftDict = SamReaderFactory.makeDefault().open(leftDictFile);
    try {
        final SAMSequenceDictionary nonMatchingDict = new SAMSequenceDictionary(
                leftDict.getFileHeader().getSequenceDictionary().getSequences().stream().
                        map((s) -> new SAMSequenceRecord("xyz_" + s.getSequenceName(), s.getSequenceLength())).
                        collect(Collectors.toList()));
        SequenceDictionaryIntersection sdi = new SequenceDictionaryIntersection(
                leftDict, leftDictFile.getName(), nonMatchingDict, "with prefixes");
        Assert.assertEquals(sdi.getIntersection().size(), 0);
        System.out.println("Terse with descriptions: " + sdi.message(false));
        System.out.println("Verbose with descriptions: " + sdi.message(true));

        sdi = new SequenceDictionaryIntersection(leftDict, nonMatchingDict);
        Assert.assertEquals(sdi.getIntersection().size(), 0);
        System.out.println("Terse without descriptions: " + sdi.message(false) + "\n");
        System.out.println("Verbose without descriptions: " + sdi.message(true) + "\n");
    } finally {
        CloserUtil.close(leftDict);
    }
}
 
Example 4
@NotNull
public static List<ChromosomeLength> create(@NotNull final SAMSequenceDictionary dictionary) {
    final List<ChromosomeLength> results = Lists.newArrayList();

    for (final SAMSequenceRecord samSequenceRecord : dictionary.getSequences()) {
        final String sequenceName = samSequenceRecord.getSequenceName();
        if (HumanChromosome.contains(sequenceName)) {
            results.add(ImmutableChromosomeLength.builder()
                    .chromosome(sequenceName)
                    .length(samSequenceRecord.getSequenceLength())
                    .build());
        }
    }

    return results;
}
 
Example 5
/**
 * Helper routine that returns whether two sequence records are equivalent, defined as having the same name and
 * lengths.
 *
 * NOTE: we allow the lengths to differ if one or both are UNKNOWN_SEQUENCE_LENGTH
 *
 * @param first first sequence record to compare
 * @param second second sequence record to compare
 * @return true if first and second have the same names and lengths, otherwise false
 */
public static boolean sequenceRecordsAreEquivalent(final SAMSequenceRecord first, final SAMSequenceRecord second) {
    if ( first == second ) {
        return true;
    }
    if ( first == null || second == null ) {
        return false;
    }
    final int length1 = first.getSequenceLength();
    final int length2 = second.getSequenceLength();

    if (length1 != length2 && length1 != SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH && length2 != SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH){
        return false;
    }
    if (! first.getSequenceName().equals(second.getSequenceName())){
        return false;
    }
    return true;
}
 
Example 6
private List<SimpleInterval> createRandomIntervals(final SAMSequenceDictionary referenceDictionary, final int numberOfIntervals, final int minIntervalSize, final int maxIntervalSize, final int meanIntervalSize, final double intervalSizeStdev) {
    final List<SimpleInterval> result = new ArrayList<>(numberOfIntervals);
    final int numberOfSequences = referenceDictionary.getSequences().size();
    for (int i = 0; i < numberOfIntervals; i++) {
        final SAMSequenceRecord contig = referenceDictionary.getSequence(RANDOM.nextInt(numberOfSequences));
        final String contigName = contig.getSequenceName();
        final int intervalSize = Math.min(maxIntervalSize, (int) Math.max(minIntervalSize, Math.round(RANDOM.nextDouble() * intervalSizeStdev + meanIntervalSize)));
        final int intervalStart = 1 + RANDOM.nextInt(contig.getSequenceLength() - intervalSize);
        final int intervalEnd = intervalStart + intervalSize - 1;
        final SimpleInterval interval = new SimpleInterval(contigName, intervalStart, intervalEnd);
        result.add(interval);
    }

    final Comparator<SimpleInterval> comparator =
            Comparator.comparing(SimpleInterval::getContig,
                    (a, b) -> Integer.compare(
                            referenceDictionary.getSequenceIndex(a),
                            referenceDictionary.getSequenceIndex(b)))
            .thenComparingInt(SimpleInterval::getStart)
            .thenComparingInt(SimpleInterval::getEnd);
    Collections.sort(result, comparator);
    return result;
}
 
Example 7
/**
 * Creates a map of Transcript IDs for use in looking up transcripts from the FASTA dictionary for the GENCODE Transcripts.
 * We include the start and stop codons in the transcripts so we can handle start/stop codon variants.
 * @param fastaReference The {@link ReferenceDataSource} corresponding to the Transcript FASTA file for this GENCODE dataset.
 * @return A {@link Map} of {@link String} -> {@link MappedTranscriptIdInfo} which maps real transcript IDs to the information about that transcript in the transcript FASTA file.
 */
@VisibleForTesting
static Map<String, MappedTranscriptIdInfo> createTranscriptIdMap(final ReferenceDataSource fastaReference) {

    final Map<String, MappedTranscriptIdInfo> idMap = new HashMap<>();

    for ( final SAMSequenceRecord sequence : fastaReference.getSequenceDictionary().getSequences() ) {

        final MappedTranscriptIdInfo transcriptInfo = createMappedTranscriptIdInfo( sequence );

        // The names in the file are actually in a list with | between each sequence name.
        // We need to split the names and add them to the dictionary so we can resolve them to the full
        // sequence name as it appears in the file:
        for ( final String transcriptId : Utils.split(sequence.getSequenceName(), "|") ) {
            idMap.put(transcriptId, transcriptInfo);
        }
    }

    return idMap;
}
 
Example 8
Source Project: cramtools   Source File: ReferenceSource.java    License: Apache License 2.0 6 votes vote down vote up
@Override
public synchronized byte[] getReferenceBases(SAMSequenceRecord record, boolean tryNameVariants) {
	byte[] bases = findBases(record, tryNameVariants);
	if (bases == null)
		return null;

	cacheW.put(record.getSequenceName(), new WeakReference<byte[]>(bases));

	String md5 = record.getAttribute(SAMSequenceRecord.MD5_TAG);
	if (md5 == null) {
		md5 = Utils.calculateMD5String(bases);
		record.setAttribute(SAMSequenceRecord.MD5_TAG, md5);
	}

	if (REF_CACHE != null)
		addToRefCache(md5, bases);

	return bases;
}
 
Example 9
Source Project: rtg-tools   Source File: SamRangeUtils.java    License: BSD 2-Clause "Simplified" License 6 votes vote down vote up
/**
 * Resolves an inital range (supplied by the user, and may have unbounded ends) to the available sequences.
 * If end is greater than number of sequences it sets end to number of sequences.
 * @param range the range
 * @param dictionary the dictionary with which to validate/resolve the range
 * @return the resolved range.
 * @throws NoTalkbackSlimException if the start is out of range.
 */
public static SequenceNameLocus resolveRestriction(SAMSequenceDictionary dictionary, SequenceNameLocus range) {
  final SAMSequenceRecord sequence = dictionary.getSequence(range.getSequenceName());
  if (sequence == null) {
    throw new NoTalkbackSlimException("Sequence \"" + range.getSequenceName() + "\" referenced in region was not found in the SAM sequence dictionary.");
  }
  final int start = range.getStart() == SamRegionRestriction.MISSING ? 0 : range.getStart();
  final int length = sequence.getSequenceLength();
  if (start > length || (length != 0 && start == length)) {  // Allow start == 0 if empty sequence
    throw new NoTalkbackSlimException("The start position \"" + start + "\" must be less than than length of the sequence \"" + length + "\".");
  }
  int end = range.getEnd() == LongRange.MISSING ? length : range.getEnd();
  if (end > length) {
    Diagnostic.warning("The end position \"" + range.getEnd() + "\" is outside the length of the sequence (" + length
      + "). Defaulting end to \"" + length + "\"");
    end = length;
  }
  return new SequenceNameLocusSimple(range.getSequenceName(), start, end);
}
 
Example 10
private static SAMRecord createRecord(int position, String ref) {
  final SAMFileHeader header = new SAMFileHeader();
  header.getSequenceDictionary().addSequence(new SAMSequenceRecord("A", 100000));
  header.getSequenceDictionary().addSequence(new SAMSequenceRecord("B", 100000));
  final SAMRecord rec = new SAMRecord(header);
  rec.setReferenceName(ref);
  rec.setReadName("read");
  rec.setReadString("ACGT");
  rec.setBaseQualityString("####");
  if (ref.equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) {
    rec.setReadUnmappedFlag(true);
  } else {
    rec.setCigarString("4=");
    rec.setAlignmentStart(position);
  }
  return rec;
}
 
Example 11
public void testSomeMethod() {
  SamRegionRestriction srr = new SamRegionRestriction("fooo");
  final SAMSequenceDictionary sdd = new SAMSequenceDictionary();
  sdd.addSequence(new SAMSequenceRecord("fooo", 9876));
  assertEquals(0, srr.resolveStart());
  assertEquals(9876, srr.resolveEnd(sdd));
  assertEquals("fooo", srr.getSequenceName());
  assertEquals(RegionRestriction.MISSING, srr.getStart());
  assertEquals(RegionRestriction.MISSING, srr.getEnd());
  assertEquals("fooo", srr.toString());

  srr = new SamRegionRestriction("fooo", 75, 555);
  assertEquals(75, srr.resolveStart());
  assertEquals(555, srr.resolveEnd(sdd));
  assertEquals("fooo", srr.getSequenceName());
  assertEquals(75, srr.getStart());
  assertEquals(555, srr.getEnd());
  assertEquals("fooo:76-555", srr.toString());
  srr = new SamRegionRestriction("fooo", 75, RegionRestriction.MISSING);
  assertEquals("fooo:76", srr.toString());
}
 
Example 12
Source Project: picard   Source File: CreateSequenceDictionary.java    License: MIT License 6 votes vote down vote up
/**
 * Read all the sequences from the given reference file, and convert into SAMSequenceRecords
 *
 * @param referenceFile fasta or fasta.gz
 * @return SAMSequenceRecords containing info from the fasta, plus from cmd-line arguments.
 */
public SAMSequenceDictionary makeSequenceDictionary(final File referenceFile) {
    final Iterable<SAMSequenceRecord> samSequenceRecordsIterable = getSamSequenceRecordsIterable();

    final List<SAMSequenceRecord> ret = new ArrayList<>();
    final Set<String> sequenceNames = new HashSet<>();
    for (SAMSequenceRecord rec : samSequenceRecordsIterable) {

        if (sequenceNames.contains(rec.getSequenceName())) {
            throw new PicardException("Sequence name appears more than once in reference: " + rec.getSequenceName());
        }
        sequenceNames.add(rec.getSequenceName());
        ret.add(rec);
    }
    return new SAMSequenceDictionary(ret);
}
 
Example 13
static SVIntervalTree<SVInterval> findGenomewideHighCoverageIntervalsToIgnore(final FindBreakpointEvidenceSparkArgumentCollection params,
                                                                              final ReadMetadata readMetadata,
                                                                              final JavaSparkContext ctx,
                                                                              final SAMFileHeader header,
                                                                              final JavaRDD<GATKRead> unfilteredReads,
                                                                              final SVReadFilter filter,
                                                                              final Logger logger,
                                                                              final Broadcast<ReadMetadata> broadcastMetadata) {
    final int capacity = header.getSequenceDictionary().getSequences().stream()
            .mapToInt(seqRec -> (seqRec.getSequenceLength() + DEPTH_WINDOW_SIZE - 1)/DEPTH_WINDOW_SIZE).sum();
    final List<SVInterval> depthIntervals = new ArrayList<>(capacity);
    for (final SAMSequenceRecord sequenceRecord : header.getSequenceDictionary().getSequences()) {
        final int contigID = readMetadata.getContigID(sequenceRecord.getSequenceName());
        final int contigLength = sequenceRecord.getSequenceLength();
        for (int i = 1; i < contigLength; i = i + DEPTH_WINDOW_SIZE) {
            depthIntervals.add(new SVInterval(contigID, i, Math.min(contigLength, i + DEPTH_WINDOW_SIZE)));
        }
    }

    final List<SVInterval> highCoverageSubintervals = findHighCoverageSubintervalsAndLog(
            params, ctx, broadcastMetadata, depthIntervals, unfilteredReads, filter, logger);
    final SVIntervalTree<SVInterval> highCoverageSubintervalTree = new SVIntervalTree<>();
    highCoverageSubintervals.forEach(i -> highCoverageSubintervalTree.put(i, i));

    return highCoverageSubintervalTree;
}
 
Example 14
Source Project: gatk   Source File: PSFilterTest.java    License: BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
@Test(groups = "spark")
public void testDoSetPairFlags() {

    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final SAMSequenceDictionary seq = new SAMSequenceDictionary();
    seq.addSequence(new SAMSequenceRecord("test_seq", 1000));
    final SAMFileHeader header = new SAMFileHeader(seq);

    final List<GATKRead> readList = makeReadSet(header);
    final JavaRDD<GATKRead> reads = ctx.parallelize(readList);
    ;

    final List<GATKRead> result = PSFilter.setPairFlags(reads, 100).collect();

    Assert.assertEquals(result.size(), 6);
    for (final GATKRead read : result) {
        if (read.getName().equals("paired_1") || read.getName().equals("paired_2")) {
            Assert.assertTrue(read.isPaired());
        } else {
            Assert.assertFalse(read.isPaired());
        }
    }

}
 
Example 15
@DataProvider(name = "simpleIntervalCollectionSortedOrderBadTestData")
public Object[][] getSimpleIntervalCollectionSortedOrderBadTestData() {
    final List<SAMSequenceRecord> extendedSAMSequenceRecords = new ArrayList<>(
            collection_0.getMetadata().getSequenceDictionary().getSequences());
    extendedSAMSequenceRecords.add(new SAMSequenceRecord("a_very_special_contig", 10));
    final SAMSequenceDictionary extendedSAMSequenceDictionary = new SAMSequenceDictionary(extendedSAMSequenceRecords);
    final SimpleIntervalCollection collection_0_bad = new SimpleIntervalCollection(
            new SimpleLocatableMetadata(extendedSAMSequenceDictionary),
            collection_0.getRecords());
    return new Object[][] { /* repeated collections */
            {Arrays.asList(collection_0, collection_0, collection_2)},
            {Arrays.asList(collection_2, collection_2, collection_2)},
            {Arrays.asList(collection_2, collection_1, collection_1)},
            {Arrays.asList(collection_0_bad, collection_1, collection_2)} /* collect_0_bad has different metadata */
    };
}
 
Example 16
Source Project: picard   Source File: CollectSamErrorMetricsTest.java    License: MIT License 6 votes vote down vote up
@Test(dataProvider = "provideForTestHasDeletionBeenProcessed")
public void testHasDeletionBeenProcessed(final String cigarString, final List<Boolean> expected) {
    final Cigar cigar = TextCigarCodec.decode(cigarString);
    final SAMRecord deletionRecord = createRecordFromCigar(cigar.toString());

    final CollectSamErrorMetrics collectSamErrorMetrics = new CollectSamErrorMetrics();
    final SAMSequenceRecord samSequenceRecord = new SAMSequenceRecord("chr1", 2000);

    int position = 100;
    int iExpected = 0;

    for(CigarElement cigarElement : cigar.getCigarElements()) {
        for(int iCigarElementPosition = 0; iCigarElementPosition < cigarElement.getLength(); iCigarElementPosition++) {
            final SamLocusIterator.LocusInfo locusInfo = new SamLocusIterator.LocusInfo(samSequenceRecord, ++position);
            if(cigarElement.getOperator() == CigarOperator.D) {
                Assert.assertEquals(collectSamErrorMetrics.processDeletionLocus(new SamLocusIterator.RecordAndOffset(deletionRecord, 0), locusInfo), (boolean) expected.get(iExpected++));
            }
        }
    }
}
 
Example 17
Source Project: chipster   Source File: PicardIndexedFastaSequenceFile.java    License: MIT License 6 votes vote down vote up
/**
 * Do some basic checking to make sure the dictionary and the index match.
 * @param fastaFile Used for error reporting only.
 * @param sequenceDictionary sequence dictionary to check against the index.
 * @param index index file to check against the dictionary.
 */
protected static void sanityCheckDictionaryAgainstIndex(final String fastaFile,
                                                        final SAMSequenceDictionary sequenceDictionary,
                                                        final FastaSequenceIndex index) {
    // Make sure dictionary and index are the same size.
    if( sequenceDictionary.getSequences().size() != index.size() )
        throw new SAMException("Sequence dictionary and index contain different numbers of contigs");

    Iterator<SAMSequenceRecord> sequenceIterator = sequenceDictionary.getSequences().iterator();
    Iterator<FastaSequenceIndexEntry> indexIterator = index.iterator();

    while(sequenceIterator.hasNext() && indexIterator.hasNext()) {
        SAMSequenceRecord sequenceEntry = sequenceIterator.next();
        FastaSequenceIndexEntry indexEntry = indexIterator.next();

        if(!sequenceEntry.getSequenceName().equals(indexEntry.getContig())) {
            throw new SAMException(String.format("Mismatch between sequence dictionary fasta index for %s, sequence '%s' != '%s'.",
                    fastaFile, sequenceEntry.getSequenceName(),indexEntry.getContig()));
        }

        // Make sure sequence length matches index length.
        if( sequenceEntry.getSequenceLength() != indexEntry.getSize())
            throw new SAMException("Index length does not match dictionary length for contig: " + sequenceEntry.getSequenceName() );
    }
}
 
Example 18
Source Project: gatk   Source File: ShardUnitTest.java    License: BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
@DataProvider(name = "DivideIntervalIntoShardsInvalidTestData")
public Object[][] divideIntervalIntoShardsInvalidTestData() {
    final SAMSequenceDictionary dictionary = new SAMSequenceDictionary(Arrays.asList(new SAMSequenceRecord("1", 16000)));

    return new Object[][] {
            // contig not in dictionary
            { new SimpleInterval("2", 1, 10), 1, 1, 0, dictionary },
            // interval not within contig bounds
            { new SimpleInterval("1", 1, 17000), 1, 1, 0, dictionary },
            // shardSize == 0
            { new SimpleInterval("1", 1, 10), 0, 1, 0, dictionary },
            // shardSize < 0
            { new SimpleInterval("1", 1, 10), -1, 1, 0, dictionary },
            // shardStep == 0
            { new SimpleInterval("1", 1, 10), 1, 0, 0, dictionary },
            // shardStep < 0
            { new SimpleInterval("1", 1, 10), 1, -1, 0, dictionary },
            // shardPadding < 0
            { new SimpleInterval("1", 1, 10), 1, 1, -1, dictionary },
            // null interval
            { null, 1, 1, 0, dictionary },
            // null dictionary
            { new SimpleInterval("1", 1, 10), 1, 1, 0, null }
    };
}
 
Example 19
/**
 * Create one SAMSequenceRecord from a single fasta sequence
 */
private SAMSequenceRecord makeSequenceRecord(final ReferenceSequence refSeq) {
  final SAMSequenceRecord ret = new SAMSequenceRecord(refSeq.getName(), refSeq.length());

  // Compute MD5 of upcased bases
  final byte[] bases = refSeq.getBases();
  for (int i = 0; i < bases.length; ++i) {
    bases[i] = StringUtil.toUpperCase(bases[i]);
  }

  ret.setAttribute(SAMSequenceRecord.MD5_TAG, md5Hash(bases));
  if (GENOME_ASSEMBLY != null) {
    ret.setAttribute(SAMSequenceRecord.ASSEMBLY_TAG, GENOME_ASSEMBLY);
  }
  ret.setAttribute(SAMSequenceRecord.URI_TAG, URI);
  if (SPECIES != null) {
    ret.setAttribute(SAMSequenceRecord.SPECIES_TAG, SPECIES);
  }
  return ret;
}
 
Example 20
@DataProvider( name = "testSequenceRecordsAreEquivalentDataProvider" )
public Object[][] testSequenceRecordsAreEquivalentDataProvider() {
    final SAMSequenceRecord CHRM_HG19 = new SAMSequenceRecord("chrM", 16571);
    final SAMSequenceRecord CHR_NONSTANDARD1 = new SAMSequenceRecord("NonStandard1", 8675309);
    final SAMSequenceRecord CHR1_HG19_WITH_UNKNOWN_LENGTH = new SAMSequenceRecord(CHR1_HG19.getSequenceName(), SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH);
    final SAMSequenceRecord CHR1_HG19_WITH_DIFFERENT_LENGTH = new SAMSequenceRecord(CHR1_HG19.getSequenceName(), 123456);
    return new Object[][]{
            {CHR1_HG19, CHR1_HG19, true},
            {CHR1_HG19, CHRM_HG19, false},
            {CHR1_HG19, CHR_NONSTANDARD1, false},
            {null, null, true},
            {CHR1_HG19, null, false},
            {null, CHR1_HG19, false},
            {CHR1_HG19, CHR1_HG19_WITH_UNKNOWN_LENGTH, true},
            {CHR1_HG19, CHR1_HG19_WITH_DIFFERENT_LENGTH, false},
            {CHR1_HG19_WITH_UNKNOWN_LENGTH, CHR1_HG19, true},
            {CHR1_HG19_WITH_DIFFERENT_LENGTH, CHR1_HG19, false},
    };
}
 
Example 21
@Test(expectedExceptions={UserException.CouldNotReadInputFile.class, UserException.MalformedGenomeLoc.class, UserException.MalformedFile.class}, dataProvider="invalidIntervalTestData")
public void testIntervalFileToListInvalidPicardIntervalHandling(GenomeLocParser genomeLocParser,
                                   String contig, int intervalStart, int intervalEnd ) throws Exception {

    final SAMFileHeader picardFileHeader = new SAMFileHeader();
    picardFileHeader.addSequence(genomeLocParser.getContigInfo("1"));
    picardFileHeader.addSequence(new SAMSequenceRecord("2", 100000));

    final IntervalList picardIntervals = new IntervalList(picardFileHeader);
    picardIntervals.add(new Interval(contig, intervalStart, intervalEnd, true, "dummyname"));

    final File picardIntervalFile = createTempFile("testIntervalFileToListInvalidPicardIntervalHandling", ".interval_list");
    picardIntervals.write(picardIntervalFile);

    // loadIntervals() will validate all intervals against the sequence dictionary in our genomeLocParser,
    // and should throw for all intervals in our invalidIntervalTestData set
    IntervalUtils.parseIntervalArguments(genomeLocParser, picardIntervalFile.getAbsolutePath());
}
 
Example 22
Source Project: cramtools   Source File: Merge.java    License: Apache License 2.0 6 votes vote down vote up
private static SAMFileHeader mergeHeaders(List<RecordSource> sources) {
	SAMFileHeader header = new SAMFileHeader();
	for (RecordSource source : sources) {
		SAMFileHeader h = source.reader.getFileHeader();

		for (SAMSequenceRecord seq : h.getSequenceDictionary().getSequences()) {
			if (header.getSequenceDictionary().getSequence(seq.getSequenceName()) == null)
				header.addSequence(seq);
		}

		for (SAMProgramRecord pro : h.getProgramRecords()) {
			if (header.getProgramRecord(pro.getProgramGroupId()) == null)
				header.addProgramRecord(pro);
		}

		for (String comment : h.getComments())
			header.addComment(comment);

		for (SAMReadGroupRecord rg : h.getReadGroups()) {
			if (header.getReadGroup(rg.getReadGroupId()) == null)
				header.addReadGroup(rg);
		}
	}

	return header;
}
 
Example 23
/**
 * Returns a compact String representation of the sequence dictionary it's passed
 *
 * The format of the returned String is:
 * [ contig1Name(length: contig1Length) contig2Name(length: contig2Length) ... ]
 *
 * @param dict a non-null SAMSequenceDictionary
 * @return A String containing all of the contig names and lengths from the sequence dictionary it's passed
 */
public static String getDictionaryAsString( final SAMSequenceDictionary dict ) {
    Utils.nonNull(dict, "Sequence dictionary must be non-null");

    StringBuilder s = new StringBuilder("[ ");

    for ( SAMSequenceRecord dictionaryEntry : dict.getSequences() ) {
        s.append(dictionaryEntry.getSequenceName());
        s.append("(length:");
        s.append(dictionaryEntry.getSequenceLength());
        s.append(") ");
    }

    s.append("]");

    return s.toString();
}
 
Example 24
Source Project: Drop-seq   Source File: ValidateReference.java    License: MIT License 5 votes vote down vote up
private SAMSequenceDictionary makeSequenceDictionary(final File referenceFile) {
    final ReferenceSequenceFile refSeqFile =
            ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceFile, true);
    ReferenceSequence refSeq;
    final List<SAMSequenceRecord> ret = new ArrayList<>();
    final Set<String> sequenceNames = new HashSet<>();
    while ((refSeq = refSeqFile.nextSequence()) != null) {
        if (sequenceNames.contains(refSeq.getName())) {
            throw new PicardException("Sequence name appears more than once in reference: " + refSeq.getName());
        }
        sequenceNames.add(refSeq.getName());
        ret.add(new SAMSequenceRecord(refSeq.getName(), refSeq.length()));
    }
    return new SAMSequenceDictionary(ret);
}
 
Example 25
Source Project: cramtools   Source File: Merge.java    License: Apache License 2.0 5 votes vote down vote up
private void advance() {
	int candidateIndex = getIndexOfMinAlignment();
	if (candidateIndex < 0) {
		next = null;
	} else {
		next = records[candidateIndex];
		SAMSequenceRecord sequence = header.getSequence(next.getReferenceName());

		next.setHeader(header);

		next.setReferenceIndex(sequence.getSequenceIndex());

		next.setReadName(sources[candidateIndex].id + delim + next.getReadName());

		if (next.getMateReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
			next.setMateAlignmentStart(SAMRecord.NO_ALIGNMENT_START);
		} else {
			SAMSequenceRecord mateSequence = header.getSequence(next.getMateReferenceName());
			next.setMateReferenceIndex(mateSequence.getSequenceIndex());
		}

		if (sources[candidateIndex].it.hasNext())
			records[candidateIndex] = sources[candidateIndex].it.next();
		else
			records[candidateIndex] = null;
	}
}
 
Example 26
Source Project: Drop-seq   Source File: SingleCellRnaSeqMetricsCollector.java    License: MIT License 5 votes vote down vote up
@Override
protected int doWork() {
	IOUtil.assertFileIsReadable(INPUT);
	IOUtil.assertFileIsWritable(OUTPUT);
	if (RIBOSOMAL_INTERVALS!=null) IOUtil.assertFileIsReadable(RIBOSOMAL_INTERVALS);

	for (final String mtSequence : MT_SEQUENCE) {
		final SAMSequenceRecord samSequenceRecord =
				SamReaderFactory.makeDefault().open(INPUT).getFileHeader().getSequence(mtSequence);
		if (samSequenceRecord == null)
			throw new RuntimeException("MT_SEQUENCE '" + mtSequence + "' is not found in sequence dictionary in " + INPUT.getAbsolutePath());
	}

	List<String> cellBarcodes = getCellBarcodes(this.CELL_BC_FILE, this.INPUT, this.CELL_BARCODE_TAG, this.READ_MQ, this.NUM_CORE_BARCODES);
	RnaSeqMetricsCollector collector = getRNASeqMetricsCollector(this.CELL_BARCODE_TAG, cellBarcodes, this.INPUT, this.STRAND_SPECIFICITY, this.RRNA_FRAGMENT_PERCENTAGE, this.READ_MQ, this.ANNOTATIONS_FILE, this.RIBOSOMAL_INTERVALS);
	final MetricsFile<RnaSeqMetrics, Integer> file = getMetricsFile();
	log.info("Adding metrics to file.  This may take a while, with no progress messages.");
   	collector.addAllLevelsToFile(file);

   	BufferedWriter b = IOUtil.openFileForBufferedWriting(OUTPUT);
   	file.write(b);
   	try {
		b.close();
	} catch (IOException io) {
		throw new TranscriptomeException("Problem writing file", io);
	}
   	return 0;
   }
 
Example 27
/**
 * Gets the subset of SAMSequenceRecords in commonContigs in dict
 *
 * @param commonContigs
 * @param dict
 * @return
 */
private static List<SAMSequenceRecord> getSequencesOfName(Set<String> commonContigs, SAMSequenceDictionary dict) {
    List<SAMSequenceRecord> l = new ArrayList<>(commonContigs.size());
    for ( String name : commonContigs ) {
        l.add(dict.getSequence(name) );
    }

    return l;
}
 
Example 28
Source Project: Drop-seq   Source File: MaskReferenceSequence.java    License: MIT License 5 votes vote down vote up
private Set<String> selectContigsToIgnore (final SAMSequenceDictionary sd, final List<String> patterns) {
	Set<String> result = new HashSet<>();
	for (SAMSequenceRecord r:  sd.getSequences()) {
		String contigName = r.getSequenceName();
		if (containsSubString(contigName, patterns))
			result.add(contigName);
	}
	return result;
}
 
Example 29
@Test(dataProvider = "fastas")
public void testCachingIndexedFastaReaderSequential1(Path fasta, Path unzipped, int cacheSize, int querySize) throws IOException {
    try(final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true, false)) {

        SAMSequenceRecord contig = caching.getSequenceDictionary().getSequence(0);
        logger.debug(String.format("Checking contig %s length %d with cache size %d and query size %d",
                                  contig.getSequenceName(), contig.getSequenceLength(), cacheSize, querySize));
        testSequential(caching, unzipped, querySize);
    }
}
 
Example 30
Source Project: Drop-seq   Source File: FilterBam.java    License: MIT License 5 votes vote down vote up
private SAMSequenceRecord cloneWithNewName(final SAMSequenceRecord sequence, final String editedSequenceName) {
     final SAMSequenceRecord ret = new SAMSequenceRecord(editedSequenceName, sequence.getSequenceLength());
     for (Map.Entry<String, String> entry : sequence.getAttributes())
if (entry.getKey().equals(SAMSequenceRecord.SEQUENCE_NAME_TAG))
	ret.setAttribute(SAMSequenceRecord.SEQUENCE_NAME_TAG, editedSequenceName);
else
	ret.setAttribute(entry.getKey(), entry.getValue());
     return ret;
 }