htsjdk.samtools.seekablestream.SeekableStream Java Examples

The following examples show how to use htsjdk.samtools.seekablestream.SeekableStream. They are taken from open source projects; the source file, project, and license are noted above each example.
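
Before the examples, a minimal sketch of the core contract may help: a SeekableStream is an InputStream that adds random access (seek, position, length). The file name below is a placeholder, not taken from any example.

import htsjdk.samtools.seekablestream.SeekableFileStream;
import htsjdk.samtools.seekablestream.SeekableStream;

import java.io.File;
import java.io.IOException;

public class SeekableStreamSketch {
	public static void main(String[] args) throws IOException {
		// "reads.bam" is a placeholder; any local file works.
		try (SeekableStream in = new SeekableFileStream(new File("reads.bam"))) {
			byte[] magic = new byte[4];
			in.seek(0);                       // random access: jump to the start
			in.read(magic, 0, magic.length);  // then read like a plain InputStream
			System.out.printf("length=%d position=%d%n", in.length(), in.position());
		}
	}
}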
Example #1
Source File: IndexAggregate.java    From cramtools with Apache License 2.0
public static IndexAggregate forDataFile(SeekableStream stream, SAMSequenceDictionary dictionary)
		throws IOException {
	String path = stream.getSource();
	File indexFile = findIndexFileFor(path);
	if (indexFile == null)
		throw new FileNotFoundException("No index found for file: " + path);

	log.info("Using index file: " + indexFile.getAbsolutePath());
	IndexAggregate a = new IndexAggregate();
	if (indexFile.getName().matches("(?i).*\\.bai")) {
		a.bai = new CachingBAMFileIndex(indexFile, dictionary);
		return a;
	}
	if (indexFile.getName().matches("(?i).*\\.crai")) {
		a.crai = CramIndex.readIndex(new GZIPInputStream(new FileInputStream(indexFile)));
		return a;
	}

	throw new FileNotFoundException("Unrecognized index file extension: " + indexFile.getName());
}
 
Example #2
Source File: BAMRecordReader.java    From Hadoop-BAM with MIT License
private SamReader createSamReader(SeekableStream in, SeekableStream inIndex,
		ValidationStringency stringency, boolean useIntelInflater) {
	SamReaderFactory readerFactory = SamReaderFactory.makeDefault()
			.setOption(SamReaderFactory.Option.CACHE_FILE_BASED_INDEXES, true)
			.setOption(SamReaderFactory.Option.EAGERLY_DECODE, false)
			.setUseAsyncIo(false);
	if (stringency != null) {
		readerFactory.validationStringency(stringency);
	}
	SamInputResource resource = SamInputResource.of(in);
	if (inIndex != null) {
		resource.index(inIndex);
	}
	if (useIntelInflater) {
		readerFactory.inflaterFactory(IntelGKLAccessor.newInflatorFactor());
	}
	return readerFactory.open(resource);
}
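
For comparison, the same factory configuration can be exercised outside Hadoop with plain file streams. A minimal sketch, with the file names as assumptions:

try (SeekableStream bam = new SeekableFileStream(new File("reads.bam"));
		SeekableStream bai = new SeekableFileStream(new File("reads.bam.bai"));
		SamReader reader = SamReaderFactory.makeDefault()
				.setOption(SamReaderFactory.Option.CACHE_FILE_BASED_INDEXES, true)
				.open(SamInputResource.of(bam).index(bai))) {
	for (SAMRecord record : reader) {
		// process each record
	}
}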
 
Example #3
Source File: CRAMInputFormat.java    From Hadoop-BAM with MIT License
private static List<Long> getContainerOffsets(Configuration conf, Path cramFile)
    throws IOException {
  SeekableStream seekableStream = WrapSeekable.openPath(conf, cramFile);
  CramContainerIterator cci = new CramContainerIterator(seekableStream);
  List<Long> containerOffsets = new ArrayList<Long>();
  containerOffsets.add(seekableStream.position());
  while (cci.hasNext()) {
    cci.next();
    containerOffsets.add(seekableStream.position());
  }
  containerOffsets.add(seekableStream.length());
  return containerOffsets;
}
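
The returned list begins at the first container (the iterator's constructor has already consumed the CRAM header), has one entry per container boundary, and ends with the file length. One plausible way to consume it, sketched here, is to derive per-container shards:

List<Long> offsets = getContainerOffsets(conf, cramFile); // method above
for (int i = 0; i < offsets.size() - 1; i++) {
	long start = offsets.get(i);
	long length = offsets.get(i + 1) - start;
	// each (start, length) pair covers exactly one CRAM container
}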
 
Example #4
Source File: Utils.java    From cramtools with Apache License 2.0
private static boolean checkEOF(htsjdk.samtools.cram.common.Version version, SeekableStream seekableStream)
		throws IOException {
	if (version.compatibleWith(CramVersions.CRAM_v3))
		return streamEndsWith(seekableStream, CramIO.ZERO_F_EOF_MARKER);
	if (version.compatibleWith(CramVersions.CRAM_v2_1))
		return streamEndsWith(seekableStream, CramIO.ZERO_B_EOF_MARKER);

	return false;
}
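
Recent htsjdk releases expose an equivalent check directly; assuming your htsjdk version ships it, the helper above reduces to:

// CramIO.checkEOF performs the same version-dependent EOF-marker test
// (availability depends on the htsjdk version; this is an assumption).
boolean intact = CramIO.checkEOF(version, seekableStream);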
 
Example #5
Source File: Utils.java    From cramtools with Apache License 2.0
private static boolean streamEndsWith(SeekableStream seekableStream, byte[] marker) throws IOException {
	byte[] tail = new byte[marker.length];
	seekableStream.seek(seekableStream.length() - marker.length);
	InputStreamUtils.readFully(seekableStream, tail, 0, tail.length);
	if (Arrays.equals(tail, marker))
		return true;
	// Relax the high nibble of byte 8 (the ITF8-encoded sequence id of the
	// EOF container) and retry the comparison:
	tail[8] = (byte) (tail[8] | 0xf0);
	return Arrays.equals(tail, marker);
}
 
Example #6
Source File: IndexAggregate.java    From cramtools with Apache License 2.0
private static long seek(BAMIndex index, int seqId, int start, int end, SeekableStream cramStream)
		throws IOException {
	BAMFileSpan span = index.getSpanOverlapping(seqId, start, end);
	if (span == null)
		return -1;
	long[] coords = span.toCoordinateArray();
	if (coords.length == 0)
		return -1;
	long[] offsets = new long[coords.length / 2];
	for (int i = 0; i < offsets.length; i++) {
		offsets[i] = coords[i * 2] >> 16;
	}
	Arrays.sort(offsets);

	// peek into containers in ascending offset order and choose the first
	// one that intersects the query:
	for (int i = 0; i < offsets.length; i++) {
		log.debug("Peeking at offset: " + offsets[i]);
		IndexAggregate.ContainerBoundary b = peek(cramStream, offsets[i]);
		if (b == null)
			continue;

		if (intersects(start, end, b)) {
			long offset = offsets[i];
			log.debug("Found query at offset: " + offset);
			cramStream.seek(offset);
			return offset;
		}
	}
	return -1;
}
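
The coordinate array holds BGZF virtual file pointers, so coords[i * 2] >> 16 discards the low 16 bits (the offset within the uncompressed block) and keeps the compressed-file offset of the container. htsjdk's pointer utility makes the same decomposition explicit; a sketch:

long virtualPointer = coords[0]; // start pointer of the first chunk
long blockAddress = BlockCompressedFilePointerUtil.getBlockAddress(virtualPointer); // == virtualPointer >> 16
int withinBlock = BlockCompressedFilePointerUtil.getBlockOffset(virtualPointer);    // low 16 bits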
 
Example #7
Source File: IndexAggregate.java    From cramtools with Apache License 2.0
private static long seek(List<CramIndex.Entry> index, int seqId, int start, int end, SeekableStream cramStream)
		throws IOException {
	List<Entry> found = CramIndex.find(index, seqId, start, end - start + 1);
	if (found == null || found.size() == 0)
		return -1;
	cramStream.seek(found.get(0).containerStartOffset);
	log.debug("Found query at offset: " + found.get(0).containerStartOffset);
	return found.get(0).containerStartOffset;
}
 
Example #8
Source File: VariantsSparkSinkUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License
private VCFHeader getHeader(String vcf) throws IOException {
    final java.nio.file.Path vcfPath = IOUtils.getPath(vcf);
    try (SeekableStream stream = new SeekablePathStream(vcfPath)) {
        return VCFHeaderReader.readHeaderFrom(stream);
    } catch (IOException e) {
        throw new UserException("Failed to read VCF header from " + vcf + "\n Caused by:" + e.getMessage(), e);
    }
}
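
A hypothetical call site; the path is an assumption:

VCFHeader header = getHeader("gs://bucket/cohort.vcf.gz");
for (String sample : header.getGenotypeSamples()) {
    System.out.println(sample); // one line per sample column in the VCF
}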
 
Example #9
Source File: CreateHadoopBamSplittingIndex.java    From gatk with BSD 3-Clause "New" or "Revised" License
private static void createOnlySplittingIndex(final File inputBam, final File index, final long granularity) {
    assertIsBam(inputBam);
    try(SeekableStream in = new SeekableFileStream(inputBam);
        BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(index))) {
            BAMSBIIndexer.createIndex(in, out, granularity);
    } catch (final IOException e) {
        throw new UserException("Couldn't create splitting index", e);
    }
}
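
The resulting .sbi file stores one BGZF virtual offset per granularity reads. Reading it back, in a sketch that assumes your htsjdk version provides SBIIndex:

SBIIndex sbi = SBIIndex.load(index.toPath());
long[] virtualOffsets = sbi.getVirtualOffsets(); // one entry per `granularity` records
System.out.println("index entries: " + virtualOffsets.length);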
 
Example #10
Source File: SeekingBAMFileReader.java    From dataflow-java with Apache License 2.0
public SeekingBAMFileReader(final SamInputResource resource,
    final boolean eagerDecode,
    final ValidationStringency validationStringency,
    final SAMRecordFactory factory,
    long offset)
        throws IOException {
  super(resource.data().asUnbufferedSeekableStream(), (SeekableStream)null, 
      eagerDecode, validationStringency, factory);
  this.offset = offset;
  this.stream = resource.data().asUnbufferedSeekableStream();
}
 
Example #11
Source File: BAMIO.java    From dataflow-java with Apache License 2.0
private static SamInputResource openBAMFile(Storage.Objects storageClient, String gcsStoragePath, SeekableStream index) throws IOException {
  SeekableGCSStream s = new SeekableGCSStream(storageClient, gcsStoragePath);
  SamInputResource samInputResource =
      SamInputResource.of(s);

  if (index != null) {
    samInputResource.index(index);
  }

  LOG.info("getReadsFromBAMFile - got input resources");
  return samInputResource;
}
 
Example #12
Source File: BAMIO.java    From dataflow-java with Apache License 2.0
private static SeekableStream openIndexForPath(Storage.Objects storageClient, String gcsStoragePath) {
  final String indexPath = gcsStoragePath + ".bai";
  try {
    return new SeekableGCSStream(storageClient, indexPath);
  } catch (IOException ex) {
    LOG.info("No index for " + indexPath);
    // Ignore if there is no bai file
  }
  return null;
}
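
A hypothetical composition of the two BAMIO helpers above:

String path = "gs://bucket/reads.bam"; // placeholder path
SeekableStream index = openIndexForPath(storageClient, path); // may be null
SamInputResource resource = openBAMFile(storageClient, path, index);
SamReader reader = SamReaderFactory.makeDefault().open(resource);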
 
Example #13
Source File: TestCRAMOutputFormat.java    From Hadoop-BAM with MIT License
private int getCRAMRecordCount(
    final File containerStreamFile,
    final SAMFileHeader header,
    final ReferenceSource refSource) throws IOException
{
    // assemble a proper CRAM file from the container stream shard(s) in
    // order to verify the contents
    final ByteArrayInputStream mergedStream = mergeCRAMContainerStream(
        containerStreamFile,
        header,
        refSource
    );

    // now we can verify that we can read everything back in
    final CRAMFileReader resultCRAMReader = new CRAMFileReader(
        mergedStream,
        (SeekableStream) null,
        refSource,
        ValidationStringency.DEFAULT_STRINGENCY);
    final Iterator<SAMRecord> it = resultCRAMReader.getIterator();
    int actualCount = 0;
    while (it.hasNext()) {
        it.next();
        actualCount++;
    }
    return actualCount;
}
 
Example #14
Source File: TestBAMSplitGuesser.java    From Hadoop-BAM with MIT License
@Test
public void test() throws Exception {
  Configuration conf = new Configuration();
  String bam = getClass().getClassLoader().getResource("test.bam").getFile();
  SeekableStream ss = WrapSeekable.openPath(conf, new Path(bam));
  BAMSplitGuesser bamSplitGuesser = new BAMSplitGuesser(ss, conf);
  long startGuess = bamSplitGuesser.guessNextBAMRecordStart(0, 3 * 0xffff + 0xfffe);
  assertEquals(SAMUtils.findVirtualOffsetOfFirstRecordInBam(new File(bam)), startGuess);
}
 
Example #15
Source File: BAMSplitGuesser.java    From Hadoop-BAM with MIT License
public BAMSplitGuesser(
		SeekableStream ss, InputStream headerStream, Configuration conf)
	throws IOException
{
	inFile = ss;

	header = SAMHeaderReader.readSAMHeaderFrom(headerStream, conf);
	referenceSequenceCount = header.getSequenceDictionary().size();

	bamCodec = new BAMRecordCodec(null, new LazyBAMRecordFactory());
}
 
Example #16
Source File: BAMSplitGuesser.java    From Hadoop-BAM with MIT License
/** The stream must point to a valid BAM file, because the header is read
 * from it.
 */
public BAMSplitGuesser(
		SeekableStream ss, Configuration conf)
	throws IOException
{
	this(ss, ss, conf);

	// Secondary check that the header points to a BAM file: Picard can get
	// things wrong due to its autodetection.
	ss.seek(0);
	if (ss.read(buf.array(), 0, 4) != 4 || buf.getInt(0) != BGZF_MAGIC)
		throw new SAMFormatException("Does not seem like a BAM file");
}
 
Example #17
Source File: BamSlicerApplication.java    From hmftools with GNU General Public License v3.0
private static void sliceFromURLs(@NotNull URL indexUrl, @NotNull URL bamUrl, @NotNull CommandLine cmd) throws IOException {
    File indexFile = downloadIndex(indexUrl);
    indexFile.deleteOnExit();

    SamReader reader = createFromCommandLine(cmd).open(SamInputResource.of(bamUrl).index(indexFile));

    BAMIndex bamIndex;
    if (indexFile.getPath().contains(".crai")) {
        SeekableStream craiIndex = CRAIIndex.openCraiFileAsBaiStream(indexFile, reader.getFileHeader().getSequenceDictionary());
        bamIndex = new DiskBasedBAMFileIndex(craiIndex, reader.getFileHeader().getSequenceDictionary());
    } else {
        bamIndex = new DiskBasedBAMFileIndex(indexFile, reader.getFileHeader().getSequenceDictionary(), false);
    }

    Optional<Pair<QueryInterval[], BAMFileSpan>> queryIntervalsAndSpan = queryIntervalsAndSpan(reader, bamIndex, cmd);
    Optional<Chunk> unmappedChunk = getUnmappedChunk(bamIndex, HttpUtils.getHeaderField(bamUrl, "Content-Length"), cmd);
    List<Chunk> sliceChunks = sliceChunks(queryIntervalsAndSpan, unmappedChunk);
    SamReader cachingReader = createCachingReader(indexFile, bamUrl, cmd, sliceChunks);

    SAMFileWriter writer = new SAMFileWriterFactory().setCreateIndex(true)
            .makeBAMWriter(reader.getFileHeader(), true, new File(cmd.getOptionValue(OUTPUT)));

    queryIntervalsAndSpan.ifPresent(pair -> {
        LOGGER.info("Slicing bam on bed regions...");
        CloseableIterator<SAMRecord> bedIterator = getIterator(cachingReader, pair.getKey(), pair.getValue().toCoordinateArray());
        writeToSlice(writer, bedIterator);
        LOGGER.info("Done writing bed slices.");
    });

    unmappedChunk.ifPresent(chunk -> {
        LOGGER.info("Slicing unmapped reads...");
        CloseableIterator<SAMRecord> unmappedIterator = cachingReader.queryUnmapped();
        writeToSlice(writer, unmappedIterator);
        LOGGER.info("Done writing unmapped reads.");
    });

    reader.close();
    writer.close();
    cachingReader.close();
}
 
Example #18
Source File: TestVCFHeaderReader.java    From Hadoop-BAM with MIT License
static SeekableStream seekableStream(final String resource) throws IOException {
  return new ByteArraySeekableStream(Resources.toByteArray(ClassLoader.getSystemClassLoader().getResource(resource)));
}
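
In-memory streams make header parsing easy to test without touching the filesystem. A sketch along the same lines, where the resource name is an assumption:

SeekableStream in = seekableStream("test.vcf");        // helper above
VCFHeader header = VCFHeaderReader.readHeaderFrom(in); // Hadoop-BAM's reader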
 
Example #19
Source File: LinearBAMIndex.java    From Hadoop-BAM with MIT License
public LinearBAMIndex(SeekableStream stream, SAMSequenceDictionary dict) {
        super(stream, dict);
}
 
Example #20
Source File: BAMFileIndexImpl.java    From dataflow-java with Apache License 2.0
public BAMFileIndexImpl(SeekableStream stream, SAMSequenceDictionary dict) {
  super(stream, dict);
  mBamDictionary = dict;
}
 
Example #21
Source File: BAMRecordReader.java    From Hadoop-BAM with MIT License
@Override public void initialize(InputSplit spl, TaskAttemptContext ctx)
           throws IOException
{
	// This method should only be called once (see Hadoop API). However,
	// there seems to be disagreement between implementations that call
	// initialize() and Hadoop-BAM's own code that relies on
	// {@link BAMInputFormat} to call initialize() when the reader is
	// created. Therefore we add this check for the time being. 
	if(isInitialized)
		close();
	isInitialized = true;
	reachedEnd = false;

	final Configuration conf = ctx.getConfiguration();

	final FileVirtualSplit split = (FileVirtualSplit)spl;
	final Path             file  = split.getPath();
	final FileSystem       fs    = file.getFileSystem(conf);

	ValidationStringency stringency = SAMHeaderReader.getValidationStringency(conf);
	boolean useIntelInflater = BAMInputFormat.useIntelInflater(conf);

	java.nio.file.Path index = SamFiles.findIndex(NIOFileUtil.asPath(fs.makeQualified(file).toUri()));
	Path fileIndex = index == null ? null : new Path(index.toUri());
	SeekableStream indexStream = fileIndex == null ? null : WrapSeekable.openPath(fs, fileIndex);
	in = WrapSeekable.openPath(fs, file);
	SamReader samReader = createSamReader(in, indexStream, stringency, useIntelInflater);
	final SAMFileHeader header = samReader.getFileHeader();

	long virtualStart = split.getStartVirtualOffset();

	fileStart  = virtualStart >>> 16;
	virtualEnd = split.getEndVirtualOffset();

	SamReader.PrimitiveSamReader primitiveSamReader =
			((SamReader.PrimitiveSamReaderToSamReaderAdapter) samReader).underlyingReader();
	bamFileReader = (BAMFileReader) primitiveSamReader;

	if (logger.isDebugEnabled()) {
		final long recordStart = virtualStart & 0xffff;
		logger.debug("Initialized BAMRecordReader; byte offset: {}, record offset: {}",
			fileStart, recordStart);
	}

	if (conf.getBoolean("hadoopbam.bam.keep-paired-reads-together", false)) {
		throw new IllegalArgumentException("Property hadoopbam.bam.keep-paired-reads-together is no longer honored.");
	}

	boolean boundedTraversal = BAMInputFormat.isBoundedTraversal(conf);
	if (boundedTraversal && split.getIntervalFilePointers() != null) {
		// return reads for intervals
		List<Interval> intervals = BAMInputFormat.getIntervals(conf);
		QueryInterval[] queryIntervals = BAMInputFormat.prepareQueryIntervals(intervals, header.getSequenceDictionary());
		iterator = bamFileReader.createIndexIterator(queryIntervals, false, split.getIntervalFilePointers());
	} else if (boundedTraversal && split.getIntervalFilePointers() == null) {
		// return unmapped reads
		iterator = bamFileReader.queryUnmapped();
	} else {
		// return everything
		BAMFileSpan splitSpan = new BAMFileSpan(new Chunk(virtualStart, virtualEnd));
		iterator = bamFileReader.getIterator(splitSpan);
	}
}
 
Example #22
Source File: BCFSplitGuesser.java    From Hadoop-BAM with MIT License
private void cinSeek(long virt) throws IOException {
	if (bgzf)
		((BlockCompressedInputStream)cin).seek(virt);
	else
		((SeekableStream)cin).seek(virt);
}
 
Example #23
Source File: BCFSplitGuesser.java    From Hadoop-BAM with MIT License
/** The stream must point to a valid BCF file, because the header is read
 * from it.
 */
public BCFSplitGuesser(SeekableStream ss) throws IOException {
	this(ss, ss);
}
 
Example #24
Source File: IndexAggregate.java    From cramtools with Apache License 2.0
public static IndexAggregate fromBaiFile(SeekableStream baiStream, SAMSequenceDictionary dictionary)
		throws IOException {
	IndexAggregate a = new IndexAggregate();
	a.bai = new CachingBAMFileIndex(baiStream, dictionary);
	return a;
}
 
Example #25
Source File: BAMInputFormat.java    From Hadoop-BAM with MIT License
private int addProbabilisticSplits(
		List<InputSplit> splits, int i, List<InputSplit> newSplits,
		Configuration cfg)
	throws IOException
{
	final Path path = ((FileSplit)splits.get(i)).getPath();
	try (final SeekableStream sin = WrapSeekable.openPath(path.getFileSystem(cfg), path)) {
		final BAMSplitGuesser guesser = new BAMSplitGuesser(sin, cfg);

		FileVirtualSplit previousSplit = null;

		for (; i < splits.size(); ++i) {
			FileSplit fspl = (FileSplit)splits.get(i);
			if (!fspl.getPath().equals(path))
				break;

			long beg =       fspl.getStart();
			long end = beg + fspl.getLength();

			long alignedBeg = guesser.guessNextBAMRecordStart(beg, end);

			// As the guesser goes to the next BGZF block before looking for BAM
			// records, the ending BGZF blocks always have to be traversed fully.
			// Hence force the length to be 0xffff, the maximum possible.
			long alignedEnd = end << 16 | 0xffff;

			if (alignedBeg == end) {
				// No records detected in this split: merge it into the previous
				// one. This can legitimately happen, e.g. if the split is so
				// small that it only contains the middle part of a BGZF block.
				//
				// Of course, if it's the first split, then this is simply not a
				// valid BAM file.
				//
				// FIXME: In theory, any number of splits could contain only parts
				// of the BAM header before we start to see splits that contain
				// BAM records. For now, we require that the split size is at
				// least as big as the header and don't handle that case.
				if (previousSplit == null)
					throw new IOException("'" + path + "': " +
						"no reads in first split: bad BAM file or tiny split size?");

				previousSplit.setEndVirtualOffset(alignedEnd);
			} else {
				previousSplit = new FileVirtualSplit(
						path, alignedBeg, alignedEnd, fspl.getLocations());
				if (logger.isDebugEnabled()) {
					final long byteOffset   = alignedBeg >>> 16;
					final long recordOffset = alignedBeg & 0xffff;
					logger.debug(
						"Split {}: byte offset: {}, record offset: {}, virtual offset: {}",
						i, byteOffset, recordOffset, alignedBeg);
				}
				newSplits.add(previousSplit);
			}
		}
	}
	return i;
}
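
The alignedEnd arithmetic builds a BGZF virtual pointer whose intra-block offset is pinned to the maximum (0xffff), so the final block is always traversed fully. htsjdk's helper expresses the same construction; a sketch:

long alignedEnd = BlockCompressedFilePointerUtil.makeFilePointer(end, 0xffff);
// identical to: end << 16 | 0xffff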
 
Example #26
Source File: KeyIgnoringVCFOutputFormat.java    From Hadoop-BAM with MIT License
public void readHeaderFrom(SeekableStream in) throws IOException {
	this.header = VCFHeaderReader.readHeaderFrom(in);
}
 
Example #27
Source File: ReferenceSequenceFromSeekable.java    From cramtools with Apache License 2.0
private ReferenceSequenceFromSeekable(SeekableStream s, Map<String, FastaSequenceIndexEntry> index) {
	this.s = s;
	this.index = index;
}
 
Example #28
Source File: KeyIgnoringVCFOutputFormat.java    From Hadoop-BAM with MIT License
public void readHeaderFrom(Path path, FileSystem fs) throws IOException {
	SeekableStream i = WrapSeekable.openPath(fs, path);
	readHeaderFrom(i);
	i.close();
}
 
Example #29
Source File: IndexAggregate.java    From cramtools with Apache License 2.0
/**
 * Find and seek the data stream to the position of the alignment query.
 * 
 * @param seqId
 *            reference sequence id
 * @param start
 *            alignment start, 1-based inclusive
 * @param end
 *            alignment end, 1-based exclusive
 * @param cramStream
 *            the data stream to seek in
 * @return the offset found or -1 if the query was not found
 * @throws IOException
 */
public long seek(int seqId, int start, int end, SeekableStream cramStream) throws IOException {
	if (crai != null)
		return seek(crai, seqId, start, end, cramStream);
	if (bai != null)
		return seek(bai, seqId, start, end, cramStream);
	return -1;
}
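
Putting the aggregate together end to end; a sketch in which the file name and the sequence dictionary are assumptions:

try (SeekableStream cram = new SeekableFileStream(new File("sample.cram"))) {
	IndexAggregate agg = IndexAggregate.forDataFile(cram, dictionary); // finds .bai/.crai next to the file
	long offset = agg.seek(0, 1_000_000, 2_000_000, cram);             // query the first reference sequence
	if (offset >= 0) {
		// cram is now positioned at the first container overlapping the query
	}
}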