htsjdk.samtools.util.BlockCompressedStreamConstants Java Examples

The following examples show how to use htsjdk.samtools.util.BlockCompressedStreamConstants. Each example is taken from an open-source project; the source file and license are noted above each snippet.
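BlockCompressedStreamConstants mainly exposes constants describing the BGZF block format, such as BLOCK_HEADER_LENGTH, MAX_COMPRESSED_BLOCK_SIZE, and the EMPTY_GZIP_BLOCK end-of-file terminator used throughout the examples below. As a minimal sketch (the demo class and output format are ours; the constants are htsjdk's):

import htsjdk.samtools.util.BlockCompressedStreamConstants;

public class BgzfConstantsDemo {
    public static void main(String[] args) {
        // Fixed size, in bytes, of every BGZF block header.
        System.out.println("BLOCK_HEADER_LENGTH       = "
                + BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
        // Upper bound on the size of a single compressed BGZF block.
        System.out.println("MAX_COMPRESSED_BLOCK_SIZE = "
                + BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE);
        // The empty BGZF block written as an end-of-file terminator.
        System.out.println("EMPTY_GZIP_BLOCK length   = "
                + BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
    }
}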
Example #1
Source File: SamUtils.java    From rtg-tools with BSD 2-Clause "Simplified" License
/**
 * @param file the file to check.
 * @return true if this looks like a BAM file.
 * @throws IOException if an IO Error occurs
 */
public static boolean isBAMFile(final File file) throws IOException {
  final boolean result;
  try (BufferedInputStream bis = new BufferedInputStream(new FileInputStream(file))) {
    if (!BlockCompressedInputStream.isValidFile(bis)) {
      return false;
    }
    final int buffSize = BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE;
    bis.mark(buffSize);
    final byte[] buffer = new byte[buffSize];
    final int len = IOUtils.readAmount(bis, buffer, 0, buffSize);
    bis.reset();
    final byte[] magicBuf = new byte[4];
    final int magicLength = IOUtils.readAmount(new BlockCompressedInputStream(new ByteArrayInputStream(buffer, 0, len)), magicBuf, 0, 4);
    // checks that we read 4 bytes and that they are "BAM\1" in ASCII
    result = magicLength == 4 && Arrays.equals(new byte[]{(byte) 66, (byte) 65, (byte) 77, (byte) 1}, magicBuf);

  }
  return result;
}
 
Example #2
Source File: TestBGZFSplitGuesser.java    From Hadoop-BAM with MIT License
@Test
public void test() throws IOException {
  Configuration conf = new Configuration();
  Path path = new Path(file.toURI());
  FSDataInputStream fsDataInputStream = path.getFileSystem(conf).open(path);
  BGZFSplitGuesser bgzfSplitGuesser = new BGZFSplitGuesser(fsDataInputStream);
  LinkedList<Long> boundaries = new LinkedList<>();
  long start = 1;
  while (true) {
    long end = file.length();
    long nextStart = bgzfSplitGuesser.guessNextBGZFBlockStart(start, end);
    if (nextStart == end) {
      break;
    }
    boundaries.add(nextStart);
    canReadFromBlockStart(nextStart);
    start = nextStart + 1;
  }
  assertEquals(firstSplit, (long) boundaries.getFirst());
  assertEquals(lastSplit, (long) boundaries.getLast());

  assertEquals("Last block start is terminator gzip block",
      file.length() - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length,
      (long) boundaries.get(boundaries.size() - 1));
}
 
Example #3
Source File: TestBAMOutputFormat.java    From Hadoop-BAM with MIT License
private ByteArrayInputStream mergeBAMBlockStream(
    final File blockStreamFile,
    final SAMFileHeader header) throws IOException
{
    // assemble a proper BAM file from the block stream shard(s) in
    // order to verify the contents
    final ByteArrayOutputStream bamOutputStream = new ByteArrayOutputStream();

    // write out the bam file header
    new SAMOutputPreparer().prepareForRecords(
        bamOutputStream,
        SAMFormat.BAM,
        header);

    // copy the contents of the block shard(s) written out by the M/R job
    final ByteArrayOutputStream blockOutputStream = new ByteArrayOutputStream();
    Files.copy(blockStreamFile.toPath(), blockOutputStream);
    blockOutputStream.writeTo(bamOutputStream);

    // add the BGZF terminator
    bamOutputStream.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
    bamOutputStream.close();

    return new ByteArrayInputStream(bamOutputStream.toByteArray());
}
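The merged stream can then be opened with htsjdk to confirm it parses as a valid BAM. A minimal verification sketch (the wiring below is an assumption; it reuses the mergeBAMBlockStream helper above, and SamReader, SamReaderFactory, SamInputResource, and SAMRecord live in htsjdk.samtools):

try (SamReader reader = SamReaderFactory.makeDefault()
        .open(SamInputResource.of(mergeBAMBlockStream(blockStreamFile, header)))) {
    // iterate the records to prove the merged stream is a well-formed BAM
    for (SAMRecord record : reader) {
        System.out.println(record.getSAMString().trim());
    }
}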
 
Example #4
Source File: BamSlicerApplication.java    From hmftools with GNU General Public License v3.0
@NotNull
private static List<Chunk> expandChunks(@NotNull List<Chunk> chunks) {
    List<Chunk> result = Lists.newArrayList();
    for (Chunk chunk : chunks) {
        long chunkEndBlockAddress = BlockCompressedFilePointerUtil.getBlockAddress(chunk.getChunkEnd());
        // extend the end by one maximal compressed block so the final block is fully covered
        long extendedEndBlockAddress = chunkEndBlockAddress + BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE;
        long newChunkEnd = Math.min(extendedEndBlockAddress, MAX_BLOCK_ADDRESS);
        // a BGZF virtual file pointer stores the compressed block address in its upper 48 bits
        long chunkEndVirtualPointer = newChunkEnd << 16;
        result.add(new Chunk(chunk.getChunkStart(), chunkEndVirtualPointer));
    }
    return result;
}
 
Example #5
Source File: SAMFileMerger.java    From Hadoop-BAM with MIT License
private static void writeTerminatorBlock(final OutputStream out, final SAMFormat samOutputFormat) throws IOException {
  if (SAMFormat.CRAM == samOutputFormat) {
    CramIO.issueEOF(CramVersions.DEFAULT_CRAM_VERSION, out); // terminate with CRAM EOF container
  } else if (SAMFormat.BAM == samOutputFormat) {
    out.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK); // add the BGZF terminator
  }
  // no terminator for SAM
}
 
Example #6
Source File: PrintBGZFBlockInformation.java    From gatk with BSD 3-Clause "New" or "Revised" License
private BGZFBlockMetadata processNextBlock(InputStream stream, String streamSource) throws IOException {
    final byte[] buffer = new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE];
    long blockAddress = streamOffset;

    final int headerByteCount = readBytes(stream, buffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);

    // Return null when we hit EOF
    if ( headerByteCount <= 0 ) {
        return null;
    }
    if (headerByteCount != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) {
        throw new IOException("Incorrect header size for file: " + streamSource);
    }
    streamOffset += headerByteCount;

    // the BSIZE field in the BGZF header stores the total block size minus one
    final int blockLength = unpackInt16(buffer, BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET) + 1;

    if (blockLength < BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH || blockLength > buffer.length) {
        throw new IOException("Unexpected compressed block length: " + blockLength + " for " + streamSource);
    }

    final int remaining = blockLength - BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH;
    final int dataByteCount = readBytes(stream, buffer, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH,
            remaining);

    if (dataByteCount != remaining) {
        throw new IOException("Premature end of file: " + streamSource);
    }
    streamOffset += dataByteCount;

    // the uncompressed size (ISIZE) occupies the last four bytes of the block
    final int uncompressedLength = unpackInt32(buffer, blockLength - 4);

    if (uncompressedLength < 0) {
        throw new IOException(streamSource + " has invalid uncompressed length: " + uncompressedLength);
    }

    return new BGZFBlockMetadata(blockAddress, blockLength, uncompressedLength);
}
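The unpackInt16 and unpackInt32 helpers used above are not shown in the original source. BGZF stores multi-byte integers little-endian, so they might look roughly like this (a sketch, not the project's actual code):

private static int unpackInt16(final byte[] buffer, final int offset) {
    // two bytes, little-endian, treated as unsigned
    return (buffer[offset] & 0xFF)
            | ((buffer[offset + 1] & 0xFF) << 8);
}

private static int unpackInt32(final byte[] buffer, final int offset) {
    // four bytes, little-endian
    return (buffer[offset] & 0xFF)
            | ((buffer[offset + 1] & 0xFF) << 8)
            | ((buffer[offset + 2] & 0xFF) << 16)
            | ((buffer[offset + 3] & 0xFF) << 24);
}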
 
Example #7
Source File: MergedVcfFile.java    From imputationserver with GNU Affero General Public License v3.0
public void close() throws IOException {
	output.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
	output.close();
}
 
Example #8
Source File: VCFFileMerger.java    From Hadoop-BAM with MIT License
private static void writeTerminatorBlock(final OutputStream out) throws IOException {
  out.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK); // add the BGZF terminator
}
 
Example #9
Source File: WriteBAMTransform.java    From dataflow-java with Apache License 2.0
@Override
public PCollection<String> expand(PCollectionTuple tuple) {
  final PCollection<HeaderInfo> header = tuple.get(HEADER_TAG);
  final PCollectionView<HeaderInfo> headerView =
      header.apply(View.<HeaderInfo>asSingleton());

  final PCollection<Read> shardedReads = tuple.get(SHARDED_READS_TAG);

  final PCollectionTuple writeBAMFilesResult =
      shardedReads.apply("Write BAM shards", ParDo
        .of(new WriteBAMFn(headerView))
        .withSideInputs(Arrays.asList(headerView))
        .withOutputTags(WriteBAMFn.WRITTEN_BAM_NAMES_TAG, TupleTagList.of(WriteBAMFn.SEQUENCE_SHARD_SIZES_TAG)));

  PCollection<String> writtenBAMShardNames = writeBAMFilesResult.get(WriteBAMFn.WRITTEN_BAM_NAMES_TAG);
  final PCollectionView<Iterable<String>> writtenBAMShardsView =
      writtenBAMShardNames.apply(View.<String>asIterable());

  final PCollection<KV<Integer, Long>> sequenceShardSizes = writeBAMFilesResult.get(WriteBAMFn.SEQUENCE_SHARD_SIZES_TAG);
  final PCollection<KV<Integer, Long>> sequenceShardSizesCombined = sequenceShardSizes.apply(
      Combine.<Integer, Long, Long>perKey(Sum.ofLongs()));
  final PCollectionView<Iterable<KV<Integer, Long>>> sequenceShardSizesView =
      sequenceShardSizesCombined.apply(View.<KV<Integer, Long>>asIterable());

  final PCollection<String> destinationBAMPath = this.pipeline.apply(
      Create.<String>of(this.output));

  final PCollectionView<byte[]> eofForBAM = pipeline.apply(
      Create.<byte[]>of(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK))
      .apply(View.<byte[]>asSingleton());

  final PCollection<String> writtenBAMFile = destinationBAMPath.apply(
      "Combine BAM shards", ParDo
        .of(new CombineShardsFn(writtenBAMShardsView, eofForBAM))
        .withSideInputs(writtenBAMShardsView, eofForBAM));

  final PCollectionView<String> writtenBAMFileView =
      writtenBAMFile.apply(View.<String>asSingleton());

  final PCollection<String> indexShards = header.apply(
      "Generate index shard tasks", ParDo
      .of(new GetReferencesFromHeaderFn()));

  final PCollectionTuple indexingResult = indexShards
      .apply(new BreakFusionTransform<String>())
      .apply(
        "Write index shards", ParDo
          .of(new WriteBAIFn(headerView, writtenBAMFileView, sequenceShardSizesView))
          .withSideInputs(headerView, writtenBAMFileView, sequenceShardSizesView)
          .withOutputTags(WriteBAIFn.WRITTEN_BAI_NAMES_TAG,
              TupleTagList.of(WriteBAIFn.NO_COORD_READS_COUNT_TAG)));

  final PCollection<String> writtenBAIShardNames = indexingResult.get(WriteBAIFn.WRITTEN_BAI_NAMES_TAG);
  final PCollectionView<Iterable<String>> writtenBAIShardsView =
      writtenBAIShardNames.apply(View.<String>asIterable());

  final PCollection<Long> noCoordCounts = indexingResult.get(WriteBAIFn.NO_COORD_READS_COUNT_TAG);

  final PCollection<Long> totalNoCoordCount = noCoordCounts
        .apply(new BreakFusionTransform<Long>())
        .apply(
            Combine.globally(Sum.ofLongs()));

  final PCollection<byte[]> totalNoCoordCountBytes = totalNoCoordCount.apply(
      "No coord count to bytes", ParDo.of(new Long2BytesFn()));
  final PCollectionView<byte[]> eofForBAI = totalNoCoordCountBytes
      .apply(View.<byte[]>asSingleton());

  final PCollection<String> destinationBAIPath = this.pipeline.apply(
      Create.<String>of(this.output + ".bai"));

  final PCollection<String> writtenBAIFile = destinationBAIPath.apply(
      "Combine BAI shards", ParDo
        .of(new CombineShardsFn(writtenBAIShardsView, eofForBAI))
        .withSideInputs(writtenBAIShardsView, eofForBAI));

  final PCollection<String> writtenFileNames = PCollectionList.of(writtenBAMFile).and(writtenBAIFile)
      .apply(Flatten.<String>pCollections());

  return writtenFileNames;
}
 
Example #10
Source File: WriteBAMFn.java    From dataflow-java with Apache License 2.0
@ProcessElement
public void processElement(DoFn<Read, String>.ProcessContext c, BoundedWindow window)
    throws Exception {

  this.window = window;

  if (headerInfo == null) {
    headerInfo = c.sideInput(headerView);
  }
  final Read read = c.element();

  if (readCount == 0) {

    shardContig = KeyReadsFn.shardKeyForRead(read, 1);
    sequenceIndex = headerInfo.header.getSequenceIndex(shardContig.referenceName);
    final boolean isFirstShard = headerInfo.shardHasFirstRead(shardContig);
    final String outputFileName = options.getOutput();
    shardName = outputFileName + "-" + String.format("%012d", sequenceIndex) + "-"
        + shardContig.referenceName
        + ":" + String.format("%012d", shardContig.start);
    LOG.info("Writing shard file " + shardName);
    final OutputStream outputStream =
        Channels.newOutputStream(
            new GcsUtil.GcsUtilFactory().create(options)
              .create(GcsPath.fromUri(shardName),
                  BAMIO.BAM_INDEX_FILE_MIME_TYPE));
    ts = new TruncatedOutputStream(
        outputStream, BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
    bw = new BAMBlockWriter(ts, null /*file*/);
    bw.setSortOrder(headerInfo.header.getSortOrder(), true);
    bw.setHeader(headerInfo.header);
    if (isFirstShard) {
      LOG.info("First shard - writing header to " + shardName);
      bw.writeHeader(headerInfo.header);
    }
  }
  SAMRecord samRecord = ReadUtils.makeSAMRecord(read, headerInfo.header);
  if (prevRead != null && prevRead.getAlignmentStart() > samRecord.getAlignmentStart()) {
    LOG.info("Out of order read " + prevRead.getAlignmentStart() + " " +
        samRecord.getAlignmentStart() + " during writing of shard " + shardName +
        " after processing " + readCount + " reads, min seen alignment is " +
        minAlignment + " and max is " + maxAlignment + ", this read is " +
        (samRecord.getReadUnmappedFlag() ? "unmapped" : "mapped") + " and its mate is " +
        (samRecord.getMateUnmappedFlag() ? "unmapped" : "mapped"));
    Metrics.counter(WriteBAMFn.class, "Out of order reads").inc();
    readCount++;
    hadOutOfOrder = true;
    return;
  }
  minAlignment = Math.min(minAlignment, samRecord.getAlignmentStart());
  maxAlignment = Math.max(maxAlignment, samRecord.getAlignmentStart());
  prevRead = samRecord;
  if (samRecord.getReadUnmappedFlag()) {
    if (!samRecord.getMateUnmappedFlag()) {
      samRecord.setReferenceName(samRecord.getMateReferenceName());
      samRecord.setAlignmentStart(samRecord.getMateAlignmentStart());
    }
    unmappedReadCount++;
  }
  bw.addAlignment(samRecord);
  readCount++;
}
 
Example #11
Source File: SparkUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Converts a headerless Hadoop bam shard (e.g., a part0000, part0001, etc. file produced by
 * {@link org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSink}) into a readable bam file
 * by adding a header and a BGZF terminator.
 *
 * This method is not intended for use with Hadoop bam shards that already have a header -- these shards are
 * already readable using samtools. Currently {@link ReadsSparkSink} saves the "shards" with a header for the
 * {@link ReadsWriteFormat#SHARDED} case, and without a header for the {@link ReadsWriteFormat#SINGLE} case.
 *
 * @param bamShard The headerless Hadoop bam shard to convert
 * @param header header for the BAM file to be created
 * @param destination path to which to write the new BAM file
 */
public static void convertHeaderlessHadoopBamShardToBam( final File bamShard, final SAMFileHeader header, final File destination ) {
    try ( FileOutputStream outStream = new FileOutputStream(destination) ) {
        writeBAMHeaderToStream(header, outStream);
        FileUtils.copyFile(bamShard, outStream);
        outStream.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
    }
    catch ( IOException e ) {
        throw new UserException("Error writing to " + destination.getAbsolutePath(), e);
    }
}
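The writeBAMHeaderToStream helper is not part of the snippet above. A rough sketch of what such a helper could look like with htsjdk (the body below is an assumption, not GATK's actual implementation): it block-compresses the BAM magic bytes, the text header, and the sequence dictionary onto the stream without closing it, so the shard contents and the BGZF terminator can follow.

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.SAMTextHeaderCodec;
import htsjdk.samtools.util.BinaryCodec;
import htsjdk.samtools.util.BlockCompressedOutputStream;

import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.StringWriter;

private static void writeBAMHeaderToStream(final SAMFileHeader header, final OutputStream out) throws IOException {
    // BGZF-compress everything written through this stream; do not close it,
    // since the shard data and the terminator are appended afterwards
    final BlockCompressedOutputStream bgzf = new BlockCompressedOutputStream(out, (File) null);
    final BinaryCodec codec = new BinaryCodec(bgzf);

    // "BAM\1" magic bytes
    codec.writeBytes(new byte[] { 'B', 'A', 'M', 1 });

    // length-prefixed text header
    final StringWriter textHeader = new StringWriter();
    new SAMTextHeaderCodec().encode(textHeader, header);
    codec.writeString(textHeader.toString(), true, false);

    // binary sequence dictionary (redundant with the text header, as the BAM spec requires)
    codec.writeInt(header.getSequenceDictionary().size());
    for (final SAMSequenceRecord sequence : header.getSequenceDictionary().getSequences()) {
        codec.writeString(sequence.getSequenceName(), true, true);
        codec.writeInt(sequence.getSequenceLength());
    }
    bgzf.flush();
}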