Java Code Examples for htsjdk.samtools.SAMSequenceRecord#getSequenceLength()

The following examples show how to use htsjdk.samtools.SAMSequenceRecord#getSequenceLength() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example 1

Source File: RandomDNA.java From gatk with BSD 3-Clause "New" or "Revised" License

6 votes

/**
 * Creates a random reference and writes it in FASTA format into a {@link Writer}.
 * @param out the output writer.
 * @param dict the dictionary indicating the number of contigs and their lengths.
 * @param basesPerLine number of base to print in each line of the output FASTA file.
 *
 * @throws IOException if such an exception was thrown while accessing and writing into the temporal file.
 * @throws IllegalArgumentException if {@code dict} is {@code null}, or {@code out } is {@code null}
 *    or {@code basesPerLine} is 0 or negative.
 */
public void nextFasta(final Writer out, final SAMSequenceDictionary dict, final int basesPerLine)
        throws IOException {
    Utils.nonNull(out);
    Utils.nonNull(dict);
    ParamUtils.isPositive(basesPerLine, "number of base per line must be strictly positive: " + basesPerLine);
    final byte[] buffer = new byte[basesPerLine];
    final String lineSeparator = System.lineSeparator();
    for (final SAMSequenceRecord sequence : dict.getSequences()) {
        int pendingBases = sequence.getSequenceLength();
        out.append(">").append(sequence.getSequenceName()).append(lineSeparator);
        while (pendingBases > 0) {
            final int lineLength = pendingBases < basesPerLine ? pendingBases : basesPerLine;
            nextBases(buffer, 0, lineLength);
            out.append(new String(buffer, 0, lineLength)).append(lineSeparator);
            pendingBases -= lineLength;
        }
    }
}

Example 2

Source File: CachingIndexedFastaSequenceFileUnitTest.java From gatk with BSD 3-Clause "New" or "Revised" License

6 votes

@Test
public void testMixedCasesInExample() throws IOException {
    try(final IndexedFastaSequenceFile original = new IndexedFastaSequenceFile(new File(exampleFASTA));
        final CachingIndexedFastaSequenceFile casePreserving = new CachingIndexedFastaSequenceFile(IOUtils.getPath(exampleFASTA), true);
        final CachingIndexedFastaSequenceFile allUpper = new CachingIndexedFastaSequenceFile(IOUtils.getPath(exampleFASTA), CachingIndexedFastaSequenceFile.DEFAULT_CACHE_SIZE, false, true);
    ) {

        int nMixedCase = 0;
        for (SAMSequenceRecord contig : original.getSequenceDictionary().getSequences()) {
            nMixedCase += mixedCasesTestHelper(original, casePreserving, allUpper, contig.getSequenceName(), -1, -1);

            final int step = 100;
            for (int lastPos = step; lastPos < contig.getSequenceLength(); lastPos += step) {
                mixedCasesTestHelper(original, casePreserving, allUpper, contig.getSequenceName(), lastPos - step, lastPos);
            }
        }


        Assert.assertTrue(nMixedCase > 0, "No mixed cases sequences found in file.  Unexpected test state");
    }
}

Example 3

Source File: SamRangeUtils.java From rtg-tools with BSD 2-Clause "Simplified" License

6 votes

/**
 * Resolves an inital range (supplied by the user, and may have unbounded ends) to the available sequences.
 * If end is greater than number of sequences it sets end to number of sequences.
 * @param range the range
 * @param dictionary the dictionary with which to validate/resolve the range
 * @return the resolved range.
 * @throws NoTalkbackSlimException if the start is out of range.
 */
public static SequenceNameLocus resolveRestriction(SAMSequenceDictionary dictionary, SequenceNameLocus range) {
  final SAMSequenceRecord sequence = dictionary.getSequence(range.getSequenceName());
  if (sequence == null) {
    throw new NoTalkbackSlimException("Sequence \"" + range.getSequenceName() + "\" referenced in region was not found in the SAM sequence dictionary.");
  }
  final int start = range.getStart() == SamRegionRestriction.MISSING ? 0 : range.getStart();
  final int length = sequence.getSequenceLength();
  if (start > length || (length != 0 && start == length)) {  // Allow start == 0 if empty sequence
    throw new NoTalkbackSlimException("The start position \"" + start + "\" must be less than than length of the sequence \"" + length + "\".");
  }
  int end = range.getEnd() == LongRange.MISSING ? length : range.getEnd();
  if (end > length) {
    Diagnostic.warning("The end position \"" + range.getEnd() + "\" is outside the length of the sequence (" + length
      + "). Defaulting end to \"" + length + "\"");
    end = length;
  }
  return new SequenceNameLocusSimple(range.getSequenceName(), start, end);
}

Example 4

Source File: SamRangeUtils.java From rtg-tools with BSD 2-Clause "Simplified" License

6 votes

static <T> void validateRanges(SAMFileHeader header, ReferenceRanges<T> rangeMap) {
  for (final String seq : rangeMap.sequenceNames()) {
    final SAMSequenceRecord r  = header.getSequenceDictionary().getSequence(seq);
    if (r == null) {
      throw new NoTalkbackSlimException("Sequence \"" + seq + "\" referenced in regions not found in the SAM sequence dictionary.");
    }

    if (r.getSequenceLength() > 0) {
      final RangeList<T> rs = rangeMap.get(seq);
      if (rs != null) {
        final List<? extends Interval> ranges = rs.getRangeList();
        final Interval last = ranges.get(ranges.size() - 1);
        if (last.getEnd() >  r.getSequenceLength()) {
          throw new NoTalkbackSlimException("Specified sequence range (" + r.getSequenceName() + ":" + last + ") is outside the length of the sequence (" + r.getSequenceLength() + ")");
        }
      }
    }
  }
}

Example 5

Source File: SamUtils.java From rtg-tools with BSD 2-Clause "Simplified" License

6 votes

/**
 * Method to check the equivalence of two SAM headers
 * @param fh a <code>SAMFileHeader</code> value
 * @param lh a <code>SAMFileHeader</code> value
 * @return true if the headers are compatible.
 */
public static boolean checkHeaderDictionary(final SAMFileHeader fh, final SAMFileHeader lh) {
  if (fh.getSortOrder() != lh.getSortOrder()) {
    return false;
  }
  final List<SAMSequenceRecord> flist = fh.getSequenceDictionary().getSequences();
  final List<SAMSequenceRecord> llist = lh.getSequenceDictionary().getSequences();
  final Iterator<SAMSequenceRecord> fi = flist.iterator();
  final Iterator<SAMSequenceRecord> li = llist.iterator();
  while (fi.hasNext()) {
    if (!li.hasNext()) {
      return false;
    }
    final SAMSequenceRecord fsr = fi.next();
    final SAMSequenceRecord lsr = li.next();
    if (!fsr.getSequenceName().equals(lsr.getSequenceName()) || fsr.getSequenceLength() != lsr.getSequenceLength()) {
      return false;
    }
  }
  if (li.hasNext()) {
    return false;
  }
  return true;
}

Example 6

Source File: PicardIndexedFastaSequenceFile.java From chipster with MIT License

6 votes

/**
 * Do some basic checking to make sure the dictionary and the index match.
 * @param fastaFile Used for error reporting only.
 * @param sequenceDictionary sequence dictionary to check against the index.
 * @param index index file to check against the dictionary.
 */
protected static void sanityCheckDictionaryAgainstIndex(final String fastaFile,
                                                        final SAMSequenceDictionary sequenceDictionary,
                                                        final FastaSequenceIndex index) {
    // Make sure dictionary and index are the same size.
    if( sequenceDictionary.getSequences().size() != index.size() )
        throw new SAMException("Sequence dictionary and index contain different numbers of contigs");

    Iterator<SAMSequenceRecord> sequenceIterator = sequenceDictionary.getSequences().iterator();
    Iterator<FastaSequenceIndexEntry> indexIterator = index.iterator();

    while(sequenceIterator.hasNext() && indexIterator.hasNext()) {
        SAMSequenceRecord sequenceEntry = sequenceIterator.next();
        FastaSequenceIndexEntry indexEntry = indexIterator.next();

        if(!sequenceEntry.getSequenceName().equals(indexEntry.getContig())) {
            throw new SAMException(String.format("Mismatch between sequence dictionary fasta index for %s, sequence '%s' != '%s'.",
                    fastaFile, sequenceEntry.getSequenceName(),indexEntry.getContig()));
        }

        // Make sure sequence length matches index length.
        if( sequenceEntry.getSequenceLength() != indexEntry.getSize())
            throw new SAMException("Index length does not match dictionary length for contig: " + sequenceEntry.getSequenceName() );
    }
}

Example 7

Source File: FindBadGenomicKmersSparkUnitTest.java From gatk with BSD 3-Clause "New" or "Revised" License

6 votes

@Test(groups = "sv")
public void miniRefTest() throws IOException {
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final ReferenceMultiSparkSource ref = new ReferenceMultiSparkSource(
            REFERENCE_FILE_NAME, ReferenceWindowFunctions.IDENTITY_FUNCTION);
    final SAMSequenceDictionary dict = ref.getReferenceSequenceDictionary(null);
    if ( dict == null ) throw new GATKException("No reference dictionary available.");

    final Map<SVKmer, Long> kmerMap = new LinkedHashMap<>();
    for ( final SAMSequenceRecord rec : dict.getSequences() ) {
        final SimpleInterval interval = new SimpleInterval(rec.getSequenceName(), 1, rec.getSequenceLength());
        final byte[] bases = ref.getReferenceBases(interval).getBases();
        SVKmerizer.canonicalStream(bases, KMER_SIZE, new SVKmerLong())
                .forEach(kmer -> kmerMap.put(kmer, kmerMap.getOrDefault(kmer, 0L) + 1));
    }
    kmerMap.entrySet().removeIf( x -> x.getValue() <= FindBadGenomicKmersSpark.MAX_KMER_FREQ);

    final List<SVKmer> badKmers =
            FindBadGenomicKmersSpark.findBadGenomicKmers(ctx, KMER_SIZE, Integer.MAX_VALUE, ref, null);
    final Set<SVKmer> badKmerSet = new HashSet<>(badKmers);
    Assert.assertEquals(badKmers.size(), badKmerSet.size());
    Assert.assertEquals(badKmerSet, kmerMap.keySet());
}

Example 8

Source File: SamRangeUtils.java From rtg-tools with BSD 2-Clause "Simplified" License

5 votes

/**
 * Make a reference range list corresponding to the full length of all reference sequences
 * @param header the SAM header containing sequence information
 * @return the ReferenceRanges lookup
 */
public static ReferenceRanges<String> createFullReferenceRanges(SAMFileHeader header) {
  final ReferenceRanges<String> rangeMap = new ReferenceRanges<>(true);
  for (final SAMSequenceRecord r : header.getSequenceDictionary().getSequences()) {
    final int rlen = r.getSequenceLength();
    if (rlen > 0) {
      rangeMap.put(r.getSequenceName(), new RangeList<>(new RangeList.RangeData<>(0, rlen, r.getSequenceName())));
    }
  }
  rangeMap.setIdMap(SamUtils.getSequenceIdLookup(header.getSequenceDictionary()));
  return rangeMap;
}

Example 9

Source File: IntervalTagComparatorTest.java From Drop-seq with MIT License

5 votes

private List<SAMRecord> createManyIntervalTaggedSAMRecords (final int desiredNumRecords) {
	List<SAMRecord> data = new ArrayList<>();

	SamReader inputSam = SamReaderFactory.makeDefault().open(this.dictFile);
	SAMRecord samRecordTemplate = new SAMRecord (inputSam.getFileHeader());

	SAMSequenceDictionary dict= inputSam.getFileHeader().getSequenceDictionary();
	List<SAMSequenceRecord> recs = dict.getSequences();
	int numRecs = recs.size();

	Random randomGenerator = new Random();
	for (int i=0; i<desiredNumRecords; i++) {
		SAMSequenceRecord r = recs.get(randomGenerator.nextInt(numRecs+1));
		String chr = r.getSequenceName();
		int seqLen = r.getSequenceLength();
		int s1 = randomGenerator.nextInt(seqLen);
		int s2 = randomGenerator.nextInt(seqLen);
		int s = Math.min(s1, s2);
		int e = Math.max(s1, s2);
		Interval interval = new Interval (chr, s1,s2);
		try {
			SAMRecord r1 = (SAMRecord) samRecordTemplate.clone();
			// I realize that using encoding the full interval can be a bit heavy handed.
			r1.setAttribute(this.intervalTag, interval.toString());
			data.add(r1);
		} catch (CloneNotSupportedException e1) {
			// this should never happen, sigh.
		}
	}
	return data;
}

Example 10

Source File: IntervalTagComparatorTest.java From Drop-seq with MIT License

5 votes

/**
 * Make this a little more like actual sequence data, where reads mapped to GL are practically non existent.
 * @param dict
 * @return
 */
private SAMSequenceDictionary filterSD (final SAMSequenceDictionary dict) {
	SAMSequenceDictionary result = new SAMSequenceDictionary();
	for (SAMSequenceRecord r: dict.getSequences())
		if (r.getSequenceLength()>10000000)
			result.addSequence(r);
	return result;
}

Example 11

Source File: VarDictLauncher.java From VarDictJava with MIT License

5 votes

/**
 * Read map of chromosome lengths
 * @param bam BAM file name
 * @return Map of chromosome lengths. Key - chromosome name, value - length
 * @throws IOException if BAM/SAM file can't be opened
 */
public static Map<String, Integer> readChr(String bam) throws IOException {
    try (SamReader reader = SamReaderFactory.makeDefault().open(new File(bam))) {
        SAMFileHeader header = reader.getFileHeader();
        Map<String, Integer> chrs = new HashMap<>();
        for (SAMSequenceRecord record : header.getSequenceDictionary().getSequences()) {
            record.getSequenceLength();
            String sn = record.getSequenceName();
            int ln = record.getSequenceLength();
            chrs.put(sn, ln);
        }
        return chrs;
    }
}

Example 12

Source File: IntervalUtils.java From gatk with BSD 3-Clause "New" or "Revised" License

5 votes

/**
 * Determines whether the provided interval is within the bounds of its assigned contig according to the provided dictionary
 *
 * @param interval interval to check
 * @param dictionary dictionary to use to validate contig bounds
 * @return true if the interval's contig exists in the dictionary, and the interval is within its bounds, otherwise false
 */
public static boolean intervalIsOnDictionaryContig( final SimpleInterval interval, final SAMSequenceDictionary dictionary ) {
    Utils.nonNull(interval);
    Utils.nonNull(dictionary);

    final SAMSequenceRecord contigRecord = dictionary.getSequence(interval.getContig());
    if ( contigRecord == null ) {
        return false;
    }

    return interval.getEnd() <= contigRecord.getSequenceLength();
}

Example 13

Source File: BamOverlapChecker.java From systemsgenetics with GNU General Public License v3.0

5 votes

public BamOverlapChecker(SamReader bam_file){
    
    SAMFileHeader  header = bam_file.getFileHeader();
    SAMSequenceDictionary dict = header.getSequenceDictionary();
    List<SAMSequenceRecord> sequences = dict.getSequences();
   
    
    booleanMap = new HashMap<String, boolean[]>();
    
    for(SAMSequenceRecord sequence : sequences){
        int sequenceEnd = sequence.getSequenceLength();
        int arrayLength = (int) Math.ceil( (float) sequenceEnd / (float)stepSize );
        boolean[] tempArray;
        tempArray = new boolean[arrayLength];
        
        for(int i=0;i<arrayLength;i++){
            SAMRecordIterator bamQuery = bam_file.queryOverlapping(sequence.getSequenceName(), i*stepSize, (i+1)*stepSize);
            if(bamQuery.hasNext()){
                tempArray[i] = true;
            }else{
                tempArray[i] = false;
            }
            bamQuery.close();
        }
        booleanMap.put(sequence.getSequenceName(),tempArray );
        //System.out.println("Finished checking the bam for chromosome " + sequence.getSequenceName());
    }
    
}

Example 14

Source File: SequenceDictionaryUtils.java From gatk with BSD 3-Clause "New" or "Revised" License

5 votes

/**
 * Trivial helper that returns true if elt has the same name and length as rec1 or rec2
 * @param elt record to test
 * @param recs the list of records to check for name and length equivalence
 * @return true if elt has the same name and length as any of the recs
 */
private static boolean isHumanSeqRecord(SAMSequenceRecord elt, SAMSequenceRecord... recs) {
    for (SAMSequenceRecord rec : recs) {
        if (elt.getSequenceLength() == rec.getSequenceLength() && elt.getSequenceName().equals(rec.getSequenceName())) {
            return true;
        }
    }
    return false;
}

Example 15

Source File: GenomeLocParser.java From gatk with BSD 3-Clause "New" or "Revised" License

5 votes

/**
 * validate a position or interval on the genome as valid
 *
 * Requires that contig exist in the master sequence dictionary, and that contig index be valid as well.  Requires
 * that start <= stop.
 *
 * if mustBeOnReference is true,
 * performs boundary validation for genome loc INTERVALS:
 * start and stop are on contig and start <= stop
 *
 * @param contig the contig name
 * @param start  the start position
 * @param stop   the stop position
 *
 * @return the interned contig name, an optimization that ensures that contig == the string in the sequence dictionary
 */
protected String validateGenomeLoc(final String contig, final int contigIndex, final int start, final int stop, final boolean mustBeOnReference) {
    if ( validationLevel == ValidationLevel.NONE )
        return contig;
    else {
        if (stop < start)
            vglHelper(String.format("The stop position %d is less than start %d in contig %s", stop, start, contig));

        final SAMSequenceRecord contigInfo = this.contigInfo.getSequence(contig);
        if ( contigInfo.getSequenceIndex() != contigIndex )
            vglHelper(String.format("The contig index %d is bad, doesn't equal the contig index %d of the contig from a string %s",
                    contigIndex, contigInfo.getSequenceIndex(), contig));

        if ( mustBeOnReference ) {
            if (start < 1)
                vglHelper(String.format("The start position %d is less than 1", start));

            if (stop < 1)
                vglHelper(String.format("The stop position %d is less than 1", stop));

            final int contigSize = contigInfo.getSequenceLength();
            if (contigSize == SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH) {
                logger.warn(String.format("The available sequence dictionary does not contain a sequence length for contig (%s). " +
                        "Skipping validation of the genome loc end coordinate (%d).",
                        contig, stop));
            }
            else if (start > contigSize || stop > contigSize) {
                vglHelper(String.format("The genome loc coordinates %d-%d exceed the contig size (%d)", start, stop, contigSize));
            }
        }

        return contigInfo.getSequenceName();
    }
}

Example 16

Source File: CachingIndexedFastaSequenceFileUnitTest.java From gatk with BSD 3-Clause "New" or "Revised" License

5 votes

@Test(dataProvider = "fastas")
public void testCachingIndexedFastaReaderTwoStage(Path fasta, Path unzipped, int cacheSize, int querySize) throws IOException {
    try(final ReferenceSequenceFile uncached = ReferenceSequenceFileFactory.getReferenceSequenceFile(unzipped);
        final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true, false)) {

        SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0);

        int middleStart = (contig.getSequenceLength() - querySize) / 2;
        int middleStop = middleStart + querySize;

        logger.debug(String.format(
                "Checking contig %s length %d with cache size %d and query size %d with intermediate query",
                contig.getSequenceName(), contig.getSequenceLength(), cacheSize, querySize));

        for (int i = 0; i < contig.getSequenceLength(); i += 10) {
            int start = i;
            int stop = start + querySize;
            if (stop <= contig.getSequenceLength()) {
                ReferenceSequence grabMiddle = caching.getSubsequenceAt(contig.getSequenceName(), middleStart,
                                                                        middleStop);
                ReferenceSequence cachedVal = caching.getSubsequenceAt(contig.getSequenceName(), start, stop);
                ReferenceSequence uncachedVal = uncached.getSubsequenceAt(contig.getSequenceName(), start, stop);

                Assert.assertEquals(cachedVal.getName(), uncachedVal.getName());
                Assert.assertEquals(cachedVal.getContigIndex(), uncachedVal.getContigIndex());
                Assert.assertEquals(cachedVal.getBases(), uncachedVal.getBases());
            }
        }
    }
}

Example 17

Source File: UpdateVCFSequenceDictionary.java From gatk with BSD 3-Clause "New" or "Revised" License

5 votes

@Override
public void apply(final VariantContext vc, final ReadsContext readsContext, final ReferenceContext ref, final FeatureContext featureContext) {
    // Validate each variant against the source dictionary manually
    SAMSequenceRecord samSeqRec = sourceDictionary.getSequence(vc.getContig());
    if (samSeqRec == null) {
        throw new CommandLineException.BadArgumentValue(
            String.format(
                "The input variant file contains a variant (ID: \"%s\") with a reference to a sequence (\"%s\") " +
                "that is not present in the provided dictionary",
                vc.getID(),
                vc.getContig()
            )
        );
    } else if (vc.getEnd() > samSeqRec.getSequenceLength()) {
        throw new CommandLineException.BadArgumentValue(
            String.format(
                "The input variant file contains a variant (ID: \"%s\") with a reference to a sequence (\"%s\") " +
                "that ends at a position (%d) that exceeds the length of that sequence (%d) in the provided dictionary",
                vc.getID(),
                vc.getContig(),
                vc.getEnd(),
                samSeqRec.getSequenceLength()
            )
        );
    }
    vcfWriter.add(vc);
}

Example 18

Source File: FilterBam.java From Drop-seq with MIT License

5 votes

private SAMSequenceRecord cloneWithNewName(final SAMSequenceRecord sequence, final String editedSequenceName) {
     final SAMSequenceRecord ret = new SAMSequenceRecord(editedSequenceName, sequence.getSequenceLength());
     for (Map.Entry<String, String> entry : sequence.getAttributes())
if (entry.getKey().equals(SAMSequenceRecord.SEQUENCE_NAME_TAG))
	ret.setAttribute(SAMSequenceRecord.SEQUENCE_NAME_TAG, editedSequenceName);
else
	ret.setAttribute(entry.getKey(), entry.getValue());
     return ret;
 }

Example 19

Source File: SAMFileHeader_Utils.java From cramtools with Apache License 2.0

4 votes

static SAMFileHeader readHeader(final BinaryCodec stream, final ValidationStringency validationStringency,
		final String source) throws IOException {

	final byte[] buffer = new byte[4];
	stream.readBytes(buffer);
	if (!Arrays.equals(buffer, "BAM\1".getBytes())) {
		throw new IOException("Invalid BAM file header");
	}

	final int headerTextLength = stream.readInt();
	final String textHeader = stream.readString(headerTextLength);
	final SAMTextHeaderCodec headerCodec = new SAMTextHeaderCodec();
	headerCodec.setValidationStringency(validationStringency);
	final SAMFileHeader samFileHeader = headerCodec.decode(new StringLineReader(textHeader), source);

	final int sequenceCount = stream.readInt();
	if (samFileHeader.getSequenceDictionary().size() > 0) {
		// It is allowed to have binary sequences but no text sequences, so
		// only validate if both are present
		if (sequenceCount != samFileHeader.getSequenceDictionary().size()) {
			throw new SAMFormatException("Number of sequences in text header ("
					+ samFileHeader.getSequenceDictionary().size() + ") != number of sequences in binary header ("
					+ sequenceCount + ") for file " + source);
		}
		for (int i = 0; i < sequenceCount; i++) {
			final SAMSequenceRecord binarySequenceRecord = readSequenceRecord(stream, source);
			final SAMSequenceRecord sequenceRecord = samFileHeader.getSequence(i);
			if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) {
				throw new SAMFormatException("For sequence " + i
						+ ", text and binary have different names in file " + source);
			}
			if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) {
				throw new SAMFormatException("For sequence " + i
						+ ", text and binary have different lengths in file " + source);
			}
		}
	} else {
		// If only binary sequences are present, copy them into
		// samFileHeader
		final List<SAMSequenceRecord> sequences = new ArrayList<SAMSequenceRecord>(sequenceCount);
		for (int i = 0; i < sequenceCount; i++) {
			sequences.add(readSequenceRecord(stream, source));
		}
		samFileHeader.setSequenceDictionary(new SAMSequenceDictionary(sequences));
	}

	return samFileHeader;
}

Example 20

Source File: BedToIntervalList.java From picard with MIT License

4 votes

@Override
protected int doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsReadable(SEQUENCE_DICTIONARY);
    IOUtil.assertFileIsWritable(OUTPUT);
    try {
        // create a new header that we will assign the dictionary provided by the SAMSequenceDictionaryExtractor to.
        final SAMFileHeader header = new SAMFileHeader();
        final SAMSequenceDictionary samSequenceDictionary = SAMSequenceDictionaryExtractor.extractDictionary(SEQUENCE_DICTIONARY.toPath());
        header.setSequenceDictionary(samSequenceDictionary);
        // set the sort order to be sorted by coordinate, which is actually done below
        // by getting the .uniqued() intervals list before we write out the file
        header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
        final IntervalList intervalList = new IntervalList(header);

        final FeatureReader<BEDFeature> bedReader = AbstractFeatureReader.getFeatureReader(INPUT.getAbsolutePath(), new BEDCodec(), false);
        final CloseableTribbleIterator<BEDFeature> iterator = bedReader.iterator();
        final ProgressLogger progressLogger = new ProgressLogger(LOG, (int) 1e6);

        while (iterator.hasNext()) {
            final BEDFeature bedFeature = iterator.next();
            final String sequenceName = bedFeature.getContig();
            final int start = bedFeature.getStart();
            final int end = bedFeature.getEnd();
            // NB: do not use an empty name within an interval
            final String name;
            if (bedFeature.getName().isEmpty()) {
                name = null;
            } else {
                name = bedFeature.getName();
            }

            final SAMSequenceRecord sequenceRecord = header.getSequenceDictionary().getSequence(sequenceName);

            // Do some validation
            if (null == sequenceRecord) {
                if (DROP_MISSING_CONTIGS) {
                    LOG.info(String.format("Dropping interval with missing contig: %s:%d-%d", sequenceName, bedFeature.getStart(), bedFeature.getEnd()));
                    missingIntervals++;
                    missingRegion += bedFeature.getEnd() - bedFeature.getStart();
                    continue;
                }
                throw new PicardException(String.format("Sequence '%s' was not found in the sequence dictionary", sequenceName));
            } else if (start < 1) {
                throw new PicardException(String.format("Start on sequence '%s' was less than one: %d", sequenceName, start));
            } else if (sequenceRecord.getSequenceLength() < start) {
                throw new PicardException(String.format("Start on sequence '%s' was past the end: %d < %d", sequenceName, sequenceRecord.getSequenceLength(), start));
            } else if ((end == 0 && start != 1 ) //special case for 0-length interval at the start of a contig
                    || end < 0 ) {
                throw new PicardException(String.format("End on sequence '%s' was less than one: %d", sequenceName, end));
            } else if (sequenceRecord.getSequenceLength() < end) {
                throw new PicardException(String.format("End on sequence '%s' was past the end: %d < %d", sequenceName, sequenceRecord.getSequenceLength(), end));
            } else if (end < start - 1) {
                throw new PicardException(String.format("On sequence '%s', end < start-1: %d <= %d", sequenceName, end, start));
            }

            final boolean isNegativeStrand = bedFeature.getStrand() == Strand.NEGATIVE;
            final Interval interval = new Interval(sequenceName, start, end, isNegativeStrand, name);
            intervalList.add(interval);

            progressLogger.record(sequenceName, start);
        }
        CloserUtil.close(bedReader);

        if (DROP_MISSING_CONTIGS) {
            if (missingRegion == 0) {
                LOG.info("There were no missing regions.");
            } else {
                LOG.warn(String.format("There were %d missing regions with a total of %d bases", missingIntervals, missingRegion));
            }
        }
        // Sort and write the output
        IntervalList out = intervalList;
        if (SORT) {
            out = out.sorted();
        }
        if (UNIQUE) {
            out = out.uniqued();
        }
        out.write(OUTPUT);
        LOG.info(String.format("Wrote %d intervals spanning a total of %d bases", out.getIntervals().size(),out.getBaseCount()));

    } catch (final IOException e) {
        throw new RuntimeException(e);
    }

    return 0;
}