htsjdk.samtools.Chunk Java Examples

The following examples show how to use htsjdk.samtools.Chunk. Each example notes the original project and source file it was taken from, which you can follow for full context.
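Before diving in, it may help to know what a Chunk represents: a half-open interval of BGZF virtual file pointers into a compressed BAM file. Each virtual pointer packs the byte address of a compressed block into its upper 48 bits and the uncompressed offset within that block into its lower 16 bits. A minimal sketch using htsjdk's pointer utilities (the pointer values are made up for illustration):

import htsjdk.samtools.Chunk;
import htsjdk.samtools.util.BlockCompressedFilePointerUtil;

public class ChunkDemo {
    public static void main(String[] args) {
        // Pack (block address, in-block offset) pairs into virtual file pointers.
        long start = BlockCompressedFilePointerUtil.makeFilePointer(0L, 0);
        long end = BlockCompressedFilePointerUtil.makeFilePointer(65_536L, 128);

        Chunk chunk = new Chunk(start, end);

        // Unpack the end pointer again: block address and in-block offset.
        System.out.println(BlockCompressedFilePointerUtil.getBlockAddress(chunk.getChunkEnd())); // 65536
        System.out.println(BlockCompressedFilePointerUtil.getBlockOffset(chunk.getChunkEnd()));  // 128
    }
}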
Example #1
Source File: BamSlicerApplication.java    From hmftools with GNU General Public License v3.0
@NotNull
private static Optional<Chunk> getUnmappedChunk(@NotNull BAMIndex bamIndex, @Nullable String contentLengthString,
        @NotNull CommandLine cmd) {
    if (cmd.hasOption(UNMAPPED)) {
        long startOfLastLinearBin = bamIndex.getStartOfLastLinearBin();
        if (startOfLastLinearBin == -1) {
            LOGGER.warn("Start of last linear bin was -1. No mapped reads found in BAM.");
            return Optional.empty();
        }
        if (contentLengthString != null) {
            try {
                long contentLength = Long.parseLong(contentLengthString);
                // Chunk coordinates are virtual file pointers, so shift the content length
                // (the byte address of end-of-file) into the upper 48 bits; the lower
                // 16 bits (the in-block offset) stay zero.
                return Optional.of(new Chunk(startOfLastLinearBin, contentLength << 16));
            } catch (NumberFormatException ignored) {
                LOGGER.error("Invalid content length ({}) for bam URL", contentLengthString);
                return Optional.empty();
            }
        }
    }
    return Optional.empty();
}
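A note on the contentLength << 16 above: chunk coordinates are virtual file pointers, so the plain byte length has to become the block-address half of a pointer. Assuming the same variables, an equivalent formulation with the htsjdk helper would be:

// Equivalent to contentLength << 16: build a virtual pointer whose block
// address is contentLength and whose in-block offset is zero.
long endPointer = BlockCompressedFilePointerUtil.makeFilePointer(contentLength);
return Optional.of(new Chunk(startOfLastLinearBin, endPointer));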
 
Example #2
Source File: CachingSeekableHTTPStream.java    From hmftools with GNU General Public License v3.0
CachingSeekableHTTPStream(@NotNull OkHttpClient httpClient, @NotNull URL url, @NotNull List<Chunk> chunks, int maxBufferSize)
        throws IOException {
    // Try to get the file length.
    // Note: HttpUtils.getHeaderField also calls setDefaultUseCaches(false), which is important.
    String contentLengthString = HttpUtils.getHeaderField(url, "Content-Length");
    if (contentLengthString != null) {
        try {
            contentLength = Long.parseLong(contentLengthString);
        } catch (NumberFormatException ignored) {
            LOGGER.warn("Invalid content length ({})  for: {}", contentLengthString, url);
            contentLength = -1;
        }
    }
    LOGGER.info("Caching max {} bam chunks from {}", maxBufferSize, url);
    chunkBuffer = new ChunkHttpBuffer(httpClient, url, maxBufferSize, chunks);
    LOGGER.info("Updating position to 0.");
    updatePosition(0);
}
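HttpUtils.getHeaderField is an hmftools helper rather than part of htsjdk. A hypothetical stand-in that fetches only the Content-Length header via a HEAD request might look like this (a sketch; the class and method names are made up):

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;

final class HeaderUtil {
    // Hypothetical stand-in for HttpUtils.getHeaderField(url, "Content-Length").
    static String contentLength(URL url) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        connection.setDefaultUseCaches(false); // mirrors the note in the constructor above
        connection.setRequestMethod("HEAD");   // headers only, no body
        try {
            return connection.getHeaderField("Content-Length");
        } finally {
            connection.disconnect();
        }
    }
}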
 
Example #3
Source File: ChunkHttpBuffer.java    From hmftools with GNU General Public License v3.0
@NotNull
private ListenableFuture<byte[]> getBytesForChunk(@NotNull Chunk chunk) {
    long start = BlockCompressedFilePointerUtil.getBlockAddress(chunk.getChunkStart());
    long end = BlockCompressedFilePointerUtil.getBlockAddress(chunk.getChunkEnd());
    if (start <= end) {
        return readUrlBytes(start, end - start);
    } else {
        return Futures.immediateFailedFuture(new IllegalArgumentException("start offset is greater than end"));
    }
}
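readUrlBytes is defined elsewhere in ChunkHttpBuffer and is asynchronous; a simplified, blocking sketch of the same range read with OkHttp could look like this (an assumption, not the project's actual code; url and httpClient are the class fields, and okhttp3.Request/Response are assumed imported):

// Sketch of a blocking range read with OkHttp; the real readUrlBytes is
// asynchronous, this is only an illustration.
private byte[] readBytesBlocking(long offset, long count) throws IOException {
    Request request = new Request.Builder()
            .url(url)
            .header("Range", "bytes=" + offset + "-" + (offset + count - 1)) // inclusive byte range
            .build();
    try (Response response = httpClient.newCall(request).execute()) {
        if (!response.isSuccessful() || response.body() == null) {
            throw new IOException("Range request failed: " + response.code());
        }
        return response.body().bytes();
    }
}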
 
Example #4
Source File: BamSlicerApplication.java    From hmftools with GNU General Public License v3.0
private static void sliceFromURLs(@NotNull URL indexUrl, @NotNull URL bamUrl, @NotNull CommandLine cmd) throws IOException {
    File indexFile = downloadIndex(indexUrl);
    indexFile.deleteOnExit();

    SamReader reader = createFromCommandLine(cmd).open(SamInputResource.of(bamUrl).index(indexFile));

    BAMIndex bamIndex;
    if (indexFile.getPath().contains(".crai")) {
        SeekableStream craiIndex = CRAIIndex.openCraiFileAsBaiStream(indexFile, reader.getFileHeader().getSequenceDictionary());
        bamIndex = new DiskBasedBAMFileIndex(craiIndex, reader.getFileHeader().getSequenceDictionary());
    } else {
        bamIndex = new DiskBasedBAMFileIndex(indexFile, reader.getFileHeader().getSequenceDictionary(), false);
    }

    Optional<Pair<QueryInterval[], BAMFileSpan>> queryIntervalsAndSpan = queryIntervalsAndSpan(reader, bamIndex, cmd);
    Optional<Chunk> unmappedChunk = getUnmappedChunk(bamIndex, HttpUtils.getHeaderField(bamUrl, "Content-Length"), cmd);
    List<Chunk> sliceChunks = sliceChunks(queryIntervalsAndSpan, unmappedChunk);
    SamReader cachingReader = createCachingReader(indexFile, bamUrl, cmd, sliceChunks);

    SAMFileWriter writer = new SAMFileWriterFactory().setCreateIndex(true)
            .makeBAMWriter(reader.getFileHeader(), true, new File(cmd.getOptionValue(OUTPUT)));

    queryIntervalsAndSpan.ifPresent(pair -> {
        LOGGER.info("Slicing bam on bed regions...");
        CloseableIterator<SAMRecord> bedIterator = getIterator(cachingReader, pair.getKey(), pair.getValue().toCoordinateArray());
        writeToSlice(writer, bedIterator);
        LOGGER.info("Done writing bed slices.");
    });

    unmappedChunk.ifPresent(chunk -> {
        LOGGER.info("Slicing unmapped reads...");
        CloseableIterator<SAMRecord> unmappedIterator = cachingReader.queryUnmapped();
        writeToSlice(writer, unmappedIterator);
        LOGGER.info("Done writing unmapped reads.");
    });

    reader.close();
    writer.close();
    cachingReader.close();
}
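writeToSlice and getIterator are private helpers in the same class; the presumed shape of writeToSlice is simply to drain the iterator into the writer (an assumption based on how it is called above):

// Presumed shape of the writeToSlice helper used above.
private static void writeToSlice(@NotNull SAMFileWriter writer, @NotNull CloseableIterator<SAMRecord> iterator) {
    while (iterator.hasNext()) {
        writer.addAlignment(iterator.next()); // SAMFileWriter accepts records one at a time
    }
    iterator.close();
}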
 
Example #5
Source File: BamSlicerApplication.java    From hmftools with GNU General Public License v3.0
@NotNull
private static SamReader createCachingReader(@NotNull File indexFile, @NotNull URL bamUrl, @NotNull CommandLine cmd,
        @NotNull List<Chunk> sliceChunks) throws IOException {
    OkHttpClient httpClient =
            SlicerHttpClient.create(Integer.parseInt(cmd.getOptionValue(MAX_CONCURRENT_REQUESTS, MAX_CONCURRENT_REQUESTS_DEFAULT)));
    int maxBufferSize = readMaxBufferSize(cmd);

    SamInputResource bamResource =
            SamInputResource.of(new CachingSeekableHTTPStream(httpClient, bamUrl, sliceChunks, maxBufferSize)).index(indexFile);
    SamReaderFactory readerFactory = createFromCommandLine(cmd);

    return readerFactory.open(bamResource);
}
 
Example #6
Source File: BamSlicerApplication.java    From hmftools with GNU General Public License v3.0
@NotNull
private static List<Chunk> sliceChunks(@NotNull Optional<Pair<QueryInterval[], BAMFileSpan>> queryIntervalsAndSpan,
        @NotNull Optional<Chunk> unmappedChunk) {
    List<Chunk> chunks = Lists.newArrayList();
    chunks.add(HEADER_CHUNK);
    queryIntervalsAndSpan.ifPresent(pair -> {
        chunks.addAll(expandChunks(pair.getValue().getChunks()));
        LOGGER.info("Generated {} query intervals which map to {} bam chunks", pair.getKey().length, chunks.size());
    });
    unmappedChunk.ifPresent(chunks::add);
    return Chunk.optimizeChunkList(chunks, 0);
}
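Chunk.optimizeChunkList, used on the last line, sorts the chunks, drops any that end at or before the given minimum virtual offset, and coalesces chunks that overlap or share a compressed block. A small illustration with made-up virtual pointers (block addresses shifted into the upper 48 bits):

List<Chunk> raw = Lists.newArrayList(
        new Chunk(0L, 100_000L << 16),
        new Chunk(90_000L << 16, 200_000L << 16),   // overlaps the first chunk
        new Chunk(900_000L << 16, 950_000L << 16)); // different block: stays separate
List<Chunk> merged = Chunk.optimizeChunkList(raw, 0);
// merged now holds two chunks: [0, 200_000 << 16) and [900_000 << 16, 950_000 << 16)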
 
Example #7
Source File: BamSlicerApplication.java    From hmftools with GNU General Public License v3.0
@NotNull
private static List<Chunk> expandChunks(@NotNull List<Chunk> chunks) {
    List<Chunk> result = Lists.newArrayList();
    for (Chunk chunk : chunks) {
        long chunkEndBlockAddress = BlockCompressedFilePointerUtil.getBlockAddress(chunk.getChunkEnd());
        long extendedEndBlockAddress = chunkEndBlockAddress + BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE;
        long newChunkEnd = Math.min(extendedEndBlockAddress, MAX_BLOCK_ADDRESS);
        long chunkEndVirtualPointer = newChunkEnd << 16;
        result.add(new Chunk(chunk.getChunkStart(), chunkEndVirtualPointer));
    }
    return result;
}
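The expansion above widens each chunk so the compressed block containing the chunk end is fetched in full: the end's block address is padded by one maximum BGZF block (MAX_COMPRESSED_BLOCK_SIZE, 64 KiB), capped at MAX_BLOCK_ADDRESS (a constant defined elsewhere in the class), and shifted back into a virtual pointer. A worked example with made-up numbers:

long chunkEnd = BlockCompressedFilePointerUtil.makeFilePointer(1_000_000L, 42);
long endBlock = BlockCompressedFilePointerUtil.getBlockAddress(chunkEnd);          // 1_000_000
long padded = endBlock + BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE; // + 64 KiB
long newEnd = padded << 16; // back to a virtual pointer with an in-block offset of zero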
 
Example #8
Source File: BAMShard.java    From dataflow-java with Apache License 2.0
/**
 * Appends chunks from another bin to the list and moves the end position.
 */
public void addBin(List<Chunk> chunksToAdd, long lastLocus) {
  assert chunks != null;
  contig = new Contig(contig.referenceName, contig.start, lastLocus);
  chunks.addAll(chunksToAdd);
  updateSpan();
}
 
Example #9
Source File: BAMShardTest.java    From dataflow-java with Apache License 2.0
@Test
public void testShardGrowth() {
  BAMShard shard = new BAMShard("f", "chr20", 1);     // shard starting at locus 1
  Chunk chunk = new Chunk(1, 20);                     // virtual pointers 1 and 20
  shard.addBin(Collections.singletonList(chunk), 10); // extend the shard to locus 10
  assertEquals(9, shard.sizeInLoci());                // 10 - 1
  assertEquals(19, shard.approximateSizeInBytes());   // 20 - 1
}
 
Example #10
Source File: BAMRecordReader.java    From Hadoop-BAM with MIT License
@Override public void initialize(InputSplit spl, TaskAttemptContext ctx)
           throws IOException
{
	// This method should only be called once (see the Hadoop API). However,
	// implementations disagree on who calls initialize(): some callers invoke
	// it themselves, while Hadoop-BAM's own code relies on
	// {@link BAMInputFormat} to call initialize() when the reader is created.
	// Hence this guard, for the time being.
	if(isInitialized)
		close();
	isInitialized = true;
	reachedEnd = false;

	final Configuration conf = ctx.getConfiguration();

	final FileVirtualSplit split = (FileVirtualSplit)spl;
	final Path             file  = split.getPath();
	final FileSystem       fs    = file.getFileSystem(conf);

	ValidationStringency stringency = SAMHeaderReader.getValidationStringency(conf);
	boolean useIntelInflater = BAMInputFormat.useIntelInflater(conf);

	java.nio.file.Path index = SamFiles.findIndex(NIOFileUtil.asPath(fs.makeQualified(file).toUri()));
	Path fileIndex = index == null ? null : new Path(index.toUri());
	SeekableStream indexStream = fileIndex == null ? null : WrapSeekable.openPath(fs, fileIndex);
	in = WrapSeekable.openPath(fs, file);
	SamReader samReader = createSamReader(in, indexStream, stringency, useIntelInflater);
	final SAMFileHeader header = samReader.getFileHeader();

	long virtualStart = split.getStartVirtualOffset();

	fileStart  = virtualStart >>> 16;
	virtualEnd = split.getEndVirtualOffset();

	SamReader.PrimitiveSamReader primitiveSamReader =
			((SamReader.PrimitiveSamReaderToSamReaderAdapter) samReader).underlyingReader();
	bamFileReader = (BAMFileReader) primitiveSamReader;

	if (logger.isDebugEnabled()) {
		final long recordStart = virtualStart & 0xffff;
		logger.debug("Initialized BAMRecordReader; byte offset: {}, record offset: {}",
			fileStart, recordStart);
	}

	if (conf.getBoolean("hadoopbam.bam.keep-paired-reads-together", false)) {
		throw new IllegalArgumentException("Property hadoopbam.bam.keep-paired-reads-together is no longer honored.");
	}

	boolean boundedTraversal = BAMInputFormat.isBoundedTraversal(conf);
	if (boundedTraversal && split.getIntervalFilePointers() != null) {
		// return reads for intervals
		List<Interval> intervals = BAMInputFormat.getIntervals(conf);
		QueryInterval[] queryIntervals = BAMInputFormat.prepareQueryIntervals(intervals, header.getSequenceDictionary());
		iterator = bamFileReader.createIndexIterator(queryIntervals, false, split.getIntervalFilePointers());
	} else if (boundedTraversal && split.getIntervalFilePointers() == null) {
		// return unmapped reads
		iterator = bamFileReader.queryUnmapped();
	} else {
		// return everything
		BAMFileSpan splitSpan = new BAMFileSpan(new Chunk(virtualStart, virtualEnd));
		iterator = bamFileReader.getIterator(splitSpan);
	}
}
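The three branches at the end of initialize() correspond to interval queries, unmapped-only traversal, and a full scan of the split's virtual offset range. Outside Hadoop, the same three modes map onto plain htsjdk calls; a standalone sketch (the file name and interval are made up; imports: htsjdk.samtools.*, java.io.File):

SamReader reader = SamReaderFactory.makeDefault().open(new File("sample.bam"));

// 1) Reads overlapping an interval (requires an index); reference index 0, 1 kb - 2 kb.
QueryInterval[] intervals = { new QueryInterval(0, 1_000, 2_000) };
try (SAMRecordIterator it = reader.query(intervals, false)) {
    while (it.hasNext()) { it.next(); }
}

// 2) Unmapped reads only.
try (SAMRecordIterator it = reader.queryUnmapped()) {
    while (it.hasNext()) { it.next(); }
}

// 3) Everything.
try (SAMRecordIterator it = reader.iterator()) {
    while (it.hasNext()) { it.next(); }
}
reader.close();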
 
Example #11
Source File: BAMShard.java    From dataflow-java with Apache License 2.0
/**
 * Updates the underlying file span by optimizing and coalescing the current chunk list.
 */
private void updateSpan() {
  span = new SAMFileSpanImpl(Chunk.optimizeChunkList(chunks, this.contig.start));
  cachedSizeInBytes = -1;
}
 
Example #12
Source File: Sharder.java    From dataflow-java with Apache License 2.0
void createShardsForReference(SAMSequenceRecord reference, Contig contig) {
  LOG.info("Creating shard for: " + contig);
  final BitSet overlappingBins = GenomicIndexUtil.regionToBins(
      (int) contig.start, (int) contig.end);
  if (overlappingBins == null) {
    LOG.warning("No overlapping bins for " + contig.start + ":" + contig.end);
    return;
  }

  BAMShard currentShard = null;
  for (int binIndex = overlappingBins.nextSetBit(0); binIndex >= 0; binIndex = overlappingBins.nextSetBit(binIndex + 1)) {
    final Bin bin = index.getBinData(reference.getSequenceIndex(), binIndex);
    if (bin == null) {
      continue;
    }
    if (LOG.isLoggable(Level.FINE)) {
      LOG.fine("Processing bin " + index.getFirstLocusInBin(bin) + "-"
          + index.getLastLocusInBin(bin));
    }

    if (index.getLevelForBin(bin) != (GenomicIndexUtil.LEVEL_STARTS.length - 1)) {
      if (LOG.isLoggable(Level.FINEST)) {
        LOG.finest("Skipping - not lowest");
      }
      continue;
      // Skip bins that are not at the lowest level. It's OK to skip
      // higher-level bins because BAMShard#finalize will get all chunks
      // overlapping the genomic region the shard ends up covering, so we
      // will not miss any reads on the boundary.
    }
    final List<Chunk> chunksInBin = bin.getChunkList();
    if (chunksInBin.isEmpty()) {
      if (LOG.isLoggable(Level.FINEST)) {
        LOG.finest("Skipping - empty");
      }
      continue; // Skip empty bins
    }

    if (currentShard == null) {
      int startLocus =
          Math.max(index.getFirstLocusInBin(bin), (int)contig.start);

      if (LOG.isLoggable(Level.FINE)) {
        LOG.fine("Creating shard starting from " + startLocus);
      }
      currentShard = new BAMShard(filePath, reference.getSequenceName(),
          startLocus);
    }
    currentShard.addBin(chunksInBin, index.getLastLocusInBin(bin));
    if (LOG.isLoggable(Level.FINE)) {
      LOG.fine("Extending the shard  to " + index.getLastLocusInBin(bin));
    }

    if (shardingPolicy.apply(currentShard)) {
      LOG.info("Shard size is big enough to finalize: " +
          currentShard.sizeInLoci() + ", " + currentShard.approximateSizeInBytes() + " bytes");
      final BAMShard bamShard = currentShard.finalize(index, Math.min(index.getLastLocusInBin(bin), (int)contig.end));
      LOG.info("Outputting shard: " + bamShard.contig);
      output.output(bamShard);
      currentShard = null;
    }
  }
  if (currentShard != null) {
    LOG.info("Outputting last shard of size " +
        currentShard.sizeInLoci() + ", " + currentShard.approximateSizeInBytes() + " bytes "
        + currentShard.contig);
    output.output(currentShard.finalize(index, (int)contig.end));
  }
}
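shardingPolicy, consulted above to decide when a shard is large enough to emit, is applied as a Guava Predicate over BAMShard. A minimal sketch of a size-based policy (the 100 MB threshold is made up; import com.google.common.base.Predicate):

Predicate<BAMShard> byteSizePolicy = new Predicate<BAMShard>() {
    @Override
    public boolean apply(BAMShard shard) {
        // Emit the shard once its coalesced chunks span roughly 100 MB.
        return shard.approximateSizeInBytes() >= 100L * 1024L * 1024L;
    }
};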