Java Code Examples for htsjdk.samtools.SamReader#iterator()

The following examples show how to use htsjdk.samtools.SamReader#iterator(). They are drawn from open-source projects; the source file, project, and license for each example are noted above its code.
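Before the project-specific examples, here is a minimal, self-contained sketch of the basic pattern: open a SamReader with SamReaderFactory, call iterator() to get a SAMRecordIterator, and walk the records. The class name SamReaderIteratorExample and the path example.bam are placeholders rather than code from any of the projects below; try-with-resources is used because both SamReader and SAMRecordIterator are Closeable.

import java.io.File;
import java.io.IOException;

import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordIterator;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;

public class SamReaderIteratorExample {
    public static void main(final String[] args) throws IOException {
        // Placeholder path; point this at a real SAM/BAM/CRAM file.
        final File input = new File("example.bam");

        // try-with-resources closes the iterator first, then the reader.
        try (final SamReader reader = SamReaderFactory.makeDefault().open(input);
             final SAMRecordIterator it = reader.iterator()) {
            long count = 0;
            while (it.hasNext()) {
                final SAMRecord rec = it.next();
                count++;   // real code would inspect rec here, e.g. rec.getReadName()
            }
            System.out.println("Read " + count + " records from " + input);
        }
    }
}

Several of the examples below close the reader explicitly via reader.close() or CloserUtil.close(reader) instead of using try-with-resources; either approach is fine as long as the reader is closed when iteration finishes. Note also that a file-backed SamReader typically allows only one open iteration at a time.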
Example 1
Source File: TagValueFilteringIteratorTest.java    From Drop-seq with MIT License
@Test
public void testCellBarcodeFiltering () {
	String cellBarcodeTag = "XC";

	SamReader inputSam = SamReaderFactory.makeDefault().open(IN_FILE);
	String [] desiredBarcodes = {"ATCAGGGACAGA", "TTGCCTTACGCG", "TACAATTAAGGC"};

	TagValueFilteringIterator<String> iter = new TagValueFilteringIterator<String>(inputSam.iterator(), cellBarcodeTag, Arrays.asList(desiredBarcodes));
	Set<String> barcodesFound = new HashSet<String>();
	while (iter.hasNext()) {
		SAMRecord r = iter.next();
		String v = r.getStringAttribute(cellBarcodeTag);
		barcodesFound.add(v);
	}

	// should be the same size.
	Assert.assertEquals(barcodesFound.size(), desiredBarcodes.length);
	// should contain the same answers.
	for (String s: desiredBarcodes)
		Assert.assertTrue(barcodesFound.contains(s));

}
 
Example 2
Source File: SamClosedFileReaderTest.java    From rtg-tools with BSD 2-Clause "Simplified" License
private void check(final File bam) throws IOException {
  final SamReader normalReader = SamUtils.makeSamReader(bam);
  final Iterator<SAMRecord> norm = normalReader.iterator();
  try {
    try (final RecordIterator<SAMRecord> closed = new SamClosedFileReader(bam, null, null, normalReader.getFileHeader())) {
      while (norm.hasNext() && closed.hasNext()) {
        final SAMRecord a = norm.next();
        final SAMRecord b = closed.next();
        assertEquals(a.getSAMString().trim(), b.getSAMString().trim());
      }
      assertEquals(norm.hasNext(), closed.hasNext());
    }
  } finally {
    normalReader.close();
  }
}
 
Example 3
Source File: SNPUMIBasePileupTest.java    From Drop-seq with MIT License
@Test(enabled=true)
public void testAllReadsPileup() {
	SamReader reader = SamReaderFactory.makeDefault().open(smallBAMFile);
	Iterator<SAMRecord> iter = reader.iterator();
	int snpPos=76227022;
	Interval snpInterval = new Interval("HUMAN_1", snpPos, snpPos, true, "test");
	SNPUMIBasePileup p = new SNPUMIBasePileup(snpInterval, "ACADM", "fake_cell", "AAAAA");
	while (iter.hasNext())
		p.addRead(iter.next());

	List<Character> bases= p.getBasesAsCharacters();
	ObjectCounter<Character> baseCounter = new ObjectCounter<>();
	for (Character c: bases)
		baseCounter.increment(c);
	Assert.assertEquals(6, baseCounter.getCountForKey('A'));
	Assert.assertEquals(7, baseCounter.getCountForKey('G'));

	Assert.assertNotNull(p.toString());

}
 
Example 4
Source File: FastqToSamTest.java    From picard with MIT License
private void convertFileAndVerifyRecordCount(final int expectedCount,
                   final String fastqFilename1,
                   final String fastqFilename2,
                   final FastqQualityFormat version,
                   final boolean permissiveFormat,
                   final boolean useSequentialFastqs) throws IOException {
    final File samFile = convertFile(fastqFilename1, fastqFilename2, version, permissiveFormat, useSequentialFastqs);
    final SamReader samReader = SamReaderFactory.makeDefault().open(samFile);
    final SAMRecordIterator iterator = samReader.iterator();
    int actualCount = 0;
    while (iterator.hasNext()) {
        iterator.next();
        actualCount++;
    }
    samReader.close();
    Assert.assertEquals(actualCount, expectedCount);
}
 
Example 5
Source File: Reader.java    From dataflow-java with Apache License 2.0
void openFile() throws IOException {
  LOG.info("Processing shard " + shard);
  final SamReader reader = BAMIO.openBAM(storageClient, shard.file,
      options.getStringency());
  iterator = null;
  if (reader.hasIndex() && reader.indexing() != null) {
    if (filter == Filter.UNMAPPED_ONLY) {
      LOG.info("Processing unmapped");
      iterator = reader.queryUnmapped();
    } else if (shard.span != null) {
      LOG.info("Processing span for " + shard.contig);
      iterator = reader.indexing().iterator(shard.span);
    } else if (shard.contig.referenceName != null && !shard.contig.referenceName.isEmpty()) {
      LOG.info("Processing all bases for " + shard.contig);
      iterator = reader.query(shard.contig.referenceName, (int) shard.contig.start,
          (int) shard.contig.end, false);
    }
  }
  if (iterator == null) {
    LOG.info("Processing all reads");
    iterator = reader.iterator();
  }
}
 
Example 6
Source File: CleanSam.java    From picard with MIT License
/**
 * Do the work after command line has been parsed.
 * RuntimeExceptions may be thrown by this method, and are reported appropriately.
 *
 * @return program exit status.
 */
@Override
protected int doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);
    final SamReaderFactory factory = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE);
    if (VALIDATION_STRINGENCY == ValidationStringency.STRICT) {
        factory.validationStringency(ValidationStringency.LENIENT);
    }
    final SamReader reader = factory.open(INPUT);
    final SAMFileWriter writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(reader.getFileHeader(), true, OUTPUT);
    final CloseableIterator<SAMRecord> it = reader.iterator();
    final ProgressLogger progress = new ProgressLogger(Log.getInstance(CleanSam.class));

    // Iterate over the reads, cleaning each record before writing it out
    while (it.hasNext()) {
        final SAMRecord rec = it.next();

        // If the read (or its mate) maps off the end of the alignment, clip it
        AbstractAlignmentMerger.createNewCigarsIfMapsOffEndOfReference(rec);

        // check the read's mapping quality
        if (rec.getReadUnmappedFlag() && 0 != rec.getMappingQuality()) {
            rec.setMappingQuality(0);
        }

        writer.addAlignment(rec);
        progress.record(rec);
    }

    writer.close();
    it.close();
    CloserUtil.close(reader);
    return 0;
}
 
Example 7
Source File: CollectJumpingLibraryMetrics.java    From picard with MIT License
/**
 * Calculates the mode for outward-facing pairs, using the first SAMPLE_FOR_MODE
 * outward-facing pairs found in INPUT
 */
private double getOutieMode() {

    int samplePerFile = SAMPLE_FOR_MODE / INPUT.size();

    Histogram<Integer> histo = new Histogram<Integer>();

    for (File f : INPUT) {
        SamReader reader = SamReaderFactory.makeDefault().open(f);
        int sampled = 0;
        for (Iterator<SAMRecord> it = reader.iterator(); it.hasNext() && sampled < samplePerFile; ) {
            SAMRecord sam = it.next();
            if (!sam.getFirstOfPairFlag()) {
                continue;
            }
            // If we get here we've hit the end of the aligned reads
            if (sam.getReadUnmappedFlag() && sam.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
                break;
            } else if (sam.getReadUnmappedFlag() || sam.getMateUnmappedFlag()) {
                continue;
            } else if ((sam.getAttribute(SAMTag.MQ.name()) == null ||
                    sam.getIntegerAttribute(SAMTag.MQ.name()) >= MINIMUM_MAPPING_QUALITY) &&
                    sam.getMappingQuality() >= MINIMUM_MAPPING_QUALITY &&
                    sam.getMateNegativeStrandFlag() != sam.getReadNegativeStrandFlag() &&
                    sam.getMateReferenceIndex().equals(sam.getReferenceIndex()) &&
                    SamPairUtil.getPairOrientation(sam) == PairOrientation.RF) {
                histo.increment(Math.abs(sam.getInferredInsertSize()));
                sampled++;
            }
        }
        CloserUtil.close(reader);
    }

    return histo.size() > 0 ? histo.getMode() : 0;
}
 
Example 8
Source File: SNPUMIBasePileupTest.java    From Drop-seq with MIT License
private SAMRecord getReadByName (final String readName, final File bamFile) {
	SamReader reader = SamReaderFactory.makeDefault().open(bamFile);
	Iterator<SAMRecord> iter = reader.iterator();
	while (iter.hasNext()) {
		SAMRecord r1 = iter.next();
		if (r1.getReadName().equals(readName))
			return (r1);
	}
	return null;
}
 
Example 9
Source File: SNPUMICellReadIteratorWrapperTest.java    From Drop-seq with MIT License
/**
 * Integration style test to see if I can throw null pointers, etc with a bit of real data.
 */
@Test(enabled=true)
public void processReads() {
	List<String> cellBarcodeList = ParseBarcodeFile.readCellBarcodeFile(cellBCFile);
	IntervalList loci = IntervalList.fromFile(snpIntervals);

	SamReader reader = SamReaderFactory.makeDefault().enable(SamReaderFactory.Option.EAGERLY_DECODE).open(bamFile);
	// Filter records before sorting, to reduce I/O
	final MissingTagFilteringIterator filteringIterator =
			new MissingTagFilteringIterator(reader.iterator(), GENE_NAME_TAG, cellBarcodeTag, molBCTag);

	MapQualityFilteredIterator filteringIterator2 = new MapQualityFilteredIterator(filteringIterator, readMQ, true);

	GeneFunctionIteratorWrapper geneFunctionIteratorWrapper = new GeneFunctionIteratorWrapper(filteringIterator2, GENE_NAME_TAG, GENE_STRAND_TAG, GENE_FUNCTION_TAG, false, STRAND_STRATEGY, LOCUS_FUNCTION_LIST);

	SNPUMICellReadIteratorWrapper snpumiCellReadIterator = new SNPUMICellReadIteratorWrapper(geneFunctionIteratorWrapper, loci, cellBarcodeTag, cellBarcodeList, GENE_NAME_TAG, snpTag, readMQ);

	// create comparators in the order the data should be sorted
	final MultiComparator<SAMRecord> multiComparator = new MultiComparator<>(
			new StringTagComparator(cellBarcodeTag),
			new StringTagComparator(GENE_NAME_TAG),
			new StringTagComparator(molBCTag));

	final CloseableIterator<SAMRecord> sortingIterator =
			SamRecordSortingIteratorFactory.create(reader.getFileHeader(), snpumiCellReadIterator, multiComparator, null);

	Assert.assertTrue(sortingIterator.hasNext());
	SAMRecord nextRead = sortingIterator.next();
	List<SAMTagAndValue> tagValues = nextRead.getAttributes();
	String snpTagValue = nextRead.getStringAttribute(this.snpTag);
	CloserUtil.close(snpumiCellReadIterator);
}
 
Example 10
Source File: SNPUMICellReadIteratorWrapperTest.java    From Drop-seq with MIT License
private SAMRecord getReadByName (final String name, final File bamFile) {
	SamReader reader = SamReaderFactory.makeDefault().open(bamFile);
	Iterator<SAMRecord> iter = reader.iterator();
	while (iter.hasNext()) {
		SAMRecord r = iter.next();
		if (r.getReadName().equals(name)) return (r);
	}
	return null;
}
 
Example 11
Source File: AggregatedTagOrderIteratorTest.java    From Drop-seq with MIT License
private Iterator<List<SAMRecord>> filterSortAndGroupByTags(final File bamFile, final String...tags) {
    final SamReader reader = SamReaderFactory.makeDefault().enable(SamReaderFactory.Option.EAGERLY_DECODE).open(bamFile);
    final Iterator<SAMRecord> filteringIterator = new MissingTagFilteringIterator(reader.iterator(), tags);
    final List<Comparator<SAMRecord>> comparators = new ArrayList<>(tags.length);
    for (final String tag: tags) {
        comparators.add(new StringTagComparator(tag));
    }
    final MultiComparator<SAMRecord> comparator = new MultiComparator<>(comparators);
    final CloseableIterator<SAMRecord> sortingIterator =
            SamRecordSortingIteratorFactory.create(reader.getFileHeader(), filteringIterator, comparator, null);
    return new GroupingIterator<>(sortingIterator, comparator);
}
 
Example 12
Source File: AggregatedTagOrderIteratorTest.java    From Drop-seq with MIT License
private Iterator<List<SAMRecord>> filterSortAndGroupByTagsAndQuality(final File bamFile, final String...tags) {
    final SamReader reader = SamReaderFactory.makeDefault().enable(SamReaderFactory.Option.EAGERLY_DECODE).open(bamFile);
    // Stack the two filters
    final Iterator<SAMRecord> filteringIterator = new MapQualityFilteredIterator(new MissingTagFilteringIterator(reader.iterator(), tags), 10, true);
    final List<Comparator<SAMRecord>> comparators = new ArrayList<>(tags.length);
    for (final String tag: tags) {
        comparators.add(new StringTagComparator(tag));
    }
    final MultiComparator<SAMRecord> comparator = new MultiComparator<>(comparators);
    final CloseableIterator<SAMRecord> sortingIterator =
            SamRecordSortingIteratorFactory.create(reader.getFileHeader(), filteringIterator, comparator, null);
    return new GroupingIterator<>(sortingIterator, comparator);
}
 
Example 13
Source File: SplitSamByLibraryTest.java    From picard with MIT License
private int countReads(File samFile) {
    SamReader reader = SamReaderFactory.makeDefault().open(samFile);
    int count = 0;
    for (Iterator<SAMRecord> it = reader.iterator(); it.hasNext(); ) {
        it.next();
        count++;
    }
    CloserUtil.close(reader);
    return count;
}
 
Example 14
Source File: BamToBfqWriter.java    From picard with MIT License
/**
 * Writes the binary fastq file(s) to the output directory
 */
public void writeBfqFiles() {

    final SamReader reader = SamReaderFactory.makeDefault().open(bamFile);
    final Iterator<SAMRecord> iterator = reader.iterator();

    // Filter out noise reads and reads that fail the quality filter
    final TagFilter tagFilter = new TagFilter(ReservedTagConstants.XN, 1);
    final FailsVendorReadQualityFilter qualityFilter = new FailsVendorReadQualityFilter();
    final WholeReadClippedFilter clippedFilter = new WholeReadClippedFilter();


    if (!pairedReads) {
        List<SamRecordFilter> filters = new ArrayList<SamRecordFilter>();
        filters.add(tagFilter);
        filters.add(clippedFilter);
        if (!this.includeNonPfReads) {
            filters.add(qualityFilter);
        }
        writeSingleEndBfqs(iterator, filters);
        codec1.close();
    }
    else {
        writePairedEndBfqs(iterator, tagFilter, qualityFilter, clippedFilter);
        codec1.close();
        codec2.close();
    }
    log.info("Wrote " + wrote + " bfq records.");
    CloserUtil.close(reader);
}
 
Example 15
Source File: ReadLocusReader.java    From abra2 with MIT License
public ReadLocusIterator(SamReader samReader, Feature region, int maxDepth) {
	this.maxDepth = maxDepth;
	
	if (region != null) {
		samIter = new ForwardShiftInsertIterator(samReader.queryOverlapping(region.getSeqname(), (int) region.getStart(), (int) region.getEnd()));
	} else {
		samIter = new ForwardShiftInsertIterator(samReader.iterator());
	}
}
 
Example 16
Source File: CompareBaseQualities.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Override
protected Object doWork() {
    if (roundDown && (staticQuantizationQuals == null || staticQuantizationQuals.isEmpty())){
        throw new CommandLineException.BadArgumentValue(
                ROUND_DOWN_QUANTIZED_LONG_NAME, "true",
                "This option can only be used if " + STATIC_QUANTIZED_QUALS_LONG_NAME + " is also used.");
    }
    staticQuantizedMapping = constructStaticQuantizedMapping(staticQuantizationQuals, roundDown);

    IOUtil.assertFileIsReadable(samFiles.get(0));
    IOUtil.assertFileIsReadable(samFiles.get(1));

    final SamReaderFactory factory = SamReaderFactory.makeDefault().validationStringency(VALIDATION_STRINGENCY);
    final SamReader reader1 = factory.referenceSequence(REFERENCE_SEQUENCE).open(samFiles.get(0));
    final SamReader reader2 = factory.referenceSequence(REFERENCE_SEQUENCE).open(samFiles.get(1));

    final SecondaryOrSupplementarySkippingIterator it1 =
            new SecondaryOrSupplementarySkippingIterator(reader1.iterator());
    final SecondaryOrSupplementarySkippingIterator it2 =
            new SecondaryOrSupplementarySkippingIterator(reader2.iterator());

    final CompareMatrix finalMatrix = new CompareMatrix(staticQuantizedMapping);

    final ProgressMeter progressMeter = new ProgressMeter(1.0);
    progressMeter.start();

    while (it1.hasCurrent() && it2.hasCurrent()) {
        final SAMRecord read1 = it1.getCurrent();
        final SAMRecord read2 = it2.getCurrent();

        progressMeter.update(read1);

        if (!read1.getReadName().equals(read2.getReadName())){
            throw new UserException.BadInput("files do not have the same exact order of reads:" +
                    read1.getReadName() + " vs " + read2.getReadName());
        }

        finalMatrix.add(read1.getBaseQualities(), read2.getBaseQualities());
        it1.advance();
        it2.advance();
    }
    progressMeter.stop();

    if (it1.hasCurrent() || it2.hasCurrent()){
        throw new UserException.BadInput("files do not have the same exact number of reads");
    }
    CloserUtil.close(reader1);
    CloserUtil.close(reader2);

    finalMatrix.printOutResults(outputFilename);

    if (throwOnDiff && finalMatrix.hasNonDiagonalElements()) {
        throw new UserException("Quality scores from the two BAMs do not match");
    }

    return finalMatrix.hasNonDiagonalElements() ? 1 : 0;
}
 
Example 17
Source File: BamToBfqWriter.java    From picard with MIT License
/**
 * Count the number of records in the bamFile that could potentially be written
 *
 * @return the number of records in the BAM file
 */
private int countWritableRecords() {
    int count = 0;

    final SamReader reader = SamReaderFactory.makeDefault().open(this.bamFile);
    if (!reader.getFileHeader().getSortOrder().equals(SAMFileHeader.SortOrder.queryname)) {
        // This is a fix for issue PIC-274: BamToBfqWriter requires that the input BAM is queryname sorted,
        // but it did not check this early, nor produce an understandable error message.
        throw new PicardException("Input file (" + this.bamFile.getAbsolutePath() + ") needs to be sorted by queryname.");
    }
    final PeekableIterator<SAMRecord> it = new PeekableIterator<SAMRecord>(reader.iterator());
    if (!this.pairedReads) {
        // Filter out noise reads and reads that fail the quality filter
        final List<SamRecordFilter> filters = new ArrayList<SamRecordFilter>();
        filters.add(new TagFilter(ReservedTagConstants.XN, 1));
        if (!this.includeNonPfReads) {
            filters.add(new FailsVendorReadQualityFilter());
        }
        final FilteringSamIterator itr = new FilteringSamIterator(it, new AggregateFilter(filters));
        while (itr.hasNext()) {
            itr.next();
            count++;
        }
    }
    else {
        while (it.hasNext()) {
            final SAMRecord first = it.next();
            final SAMRecord second = it.next();
            // If both are noise reads, filter them out
            if (first.getAttribute(ReservedTagConstants.XN) != null &&
                second.getAttribute(ReservedTagConstants.XN) != null)  {
                // skip it
            }
            // If either fails to pass filter, then exclude them as well
            else if (!this.includeNonPfReads && (first.getReadFailsVendorQualityCheckFlag() || second.getReadFailsVendorQualityCheckFlag()) ) {
                // skip it
            }
            // Otherwise, write them out
            else {
                count++;
            }
        }
    }
    it.close();
    CloserUtil.close(reader);
    return count;
}
 
Example 18
Source File: MarkIlluminaAdapters.java    From picard with MIT License
@Override
protected int doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(METRICS);

    final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);
    final SAMFileHeader.SortOrder order = in.getFileHeader().getSortOrder();
    SAMFileWriter out = null;
    if (OUTPUT != null) {
        IOUtil.assertFileIsWritable(OUTPUT);
        out = new SAMFileWriterFactory().makeSAMOrBAMWriter(in.getFileHeader(), true, OUTPUT);
    }

    final Histogram<Integer> histo = new Histogram<Integer>("clipped_bases", "read_count");

    // Combine any adapters and custom adapter pairs from the command line into an array for use in clipping
    final AdapterPair[] adapters;
    {
        final List<AdapterPair> tmp = new ArrayList<AdapterPair>();
        tmp.addAll(ADAPTERS);
        if (FIVE_PRIME_ADAPTER != null && THREE_PRIME_ADAPTER != null) {
            tmp.add(new CustomAdapterPair(FIVE_PRIME_ADAPTER, THREE_PRIME_ADAPTER));
        }
        adapters = tmp.toArray(new AdapterPair[tmp.size()]);
    }

    ////////////////////////////////////////////////////////////////////////
    // Main loop that consumes reads, clips them and writes them to the output
    ////////////////////////////////////////////////////////////////////////
    final ProgressLogger progress = new ProgressLogger(log, 1000000, "Read");
    final SAMRecordIterator iterator = in.iterator();

    final AdapterMarker adapterMarker = new AdapterMarker(ADAPTER_TRUNCATION_LENGTH, adapters).
            setMaxPairErrorRate(MAX_ERROR_RATE_PE).setMinPairMatchBases(MIN_MATCH_BASES_PE).
            setMaxSingleEndErrorRate(MAX_ERROR_RATE_SE).setMinSingleEndMatchBases(MIN_MATCH_BASES_SE).
            setNumAdaptersToKeep(NUM_ADAPTERS_TO_KEEP).
            setThresholdForSelectingAdaptersToKeep(PRUNE_ADAPTER_LIST_AFTER_THIS_MANY_ADAPTERS_SEEN);

    while (iterator.hasNext()) {
        final SAMRecord rec = iterator.next();
        final SAMRecord rec2 = rec.getReadPairedFlag() && iterator.hasNext() ? iterator.next() : null;
        rec.setAttribute(ReservedTagConstants.XT, null);

        // Do the clipping one way for PE and another for SE reads
        if (rec.getReadPairedFlag()) {
            // Assert that the input file is in query name order only if we see some PE reads
            if (order != SAMFileHeader.SortOrder.queryname) {
                throw new PicardException("Input BAM file must be sorted by queryname");
            }

            if (rec2 == null) throw new PicardException("Missing mate pair for paired read: " + rec.getReadName());
            rec2.setAttribute(ReservedTagConstants.XT, null);

            // Assert that we did in fact just get two mate pairs
            if (!rec.getReadName().equals(rec2.getReadName())) {
                throw new PicardException("Adjacent reads expected to be mate-pairs have different names: " +
                        rec.getReadName() + ", " + rec2.getReadName());
            }

            // establish which of pair is first and which second
            final SAMRecord first, second;

            if (rec.getFirstOfPairFlag() && rec2.getSecondOfPairFlag()) {
                first = rec;
                second = rec2;
            } else if (rec.getSecondOfPairFlag() && rec2.getFirstOfPairFlag()) {
                first = rec2;
                second = rec;
            } else {
                throw new PicardException("Two reads with same name but not correctly marked as 1st/2nd of pair: " + rec.getReadName());
            }

            adapterMarker.adapterTrimIlluminaPairedReads(first, second);
        } else {
            adapterMarker.adapterTrimIlluminaSingleRead(rec);
        }

        // Then output the records, update progress and metrics
        for (final SAMRecord r : new SAMRecord[]{rec, rec2}) {
            if (r != null) {
                progress.record(r);
                if (out != null) out.addAlignment(r);

                final Integer clip = r.getIntegerAttribute(ReservedTagConstants.XT);
                if (clip != null) histo.increment(r.getReadLength() - clip + 1);
            }
        }
    }

    if (out != null) out.close();

    // Lastly output the metrics to file
    final MetricsFile<?, Integer> metricsFile = getMetricsFile();
    metricsFile.setHistogram(histo);
    metricsFile.write(METRICS);

    CloserUtil.close(in);
    return 0;
}
 
Example 19
Source File: WriteBAIFn.java    From dataflow-java with Apache License 2.0
@ProcessElement
public void processElement(DoFn<String, String>.ProcessContext c) throws Exception {
  Metrics.counter(WriteBAIFn.class, "Initialized Indexing Shard Count").inc();
  Stopwatch stopWatch = Stopwatch.createStarted();

  final HeaderInfo header = c.sideInput(headerView);
  final String bamFilePath = c.sideInput(writtenBAMFilerView);
  final Iterable<KV<Integer, Long>> sequenceShardSizes = c.sideInput(sequenceShardSizesView);

  final String sequenceName = c.element();
  final int sequenceIndex = header.header.getSequence(sequenceName).getSequenceIndex();
  final String baiFilePath = bamFilePath + "-" +
      String.format("%02d",sequenceIndex) + "-" + sequenceName + ".bai";

  long offset = 0;
  int skippedReferences  = 0;
  long bytesToProcess = 0;

  for (KV<Integer, Long> sequenceShardSize : sequenceShardSizes) {
    if (sequenceShardSize.getKey() < sequenceIndex) {
      offset += sequenceShardSize.getValue();
      skippedReferences++;
    } else if (sequenceShardSize.getKey() == sequenceIndex) {
      bytesToProcess = sequenceShardSize.getValue();
    }
  }
  LOG.info("Generating BAI index: " + baiFilePath);
  LOG.info("Reading BAM file: " + bamFilePath + " for reference " + sequenceName +
      ", skipping " + skippedReferences + " references at offset " + offset +
      ", expecting to process " + bytesToProcess + " bytes");

  Options options = c.getPipelineOptions().as(Options.class);
  final Storage.Objects storage = Transport.newStorageClient(
      options
        .as(GCSOptions.class))
        .build()
        .objects();
  final SamReader reader = BAMIO.openBAM(storage, bamFilePath, ValidationStringency.SILENT, true,
      offset);
  final OutputStream outputStream =
      Channels.newOutputStream(
          new GcsUtil.GcsUtilFactory().create(options)
            .create(GcsPath.fromUri(baiFilePath),
                BAMIO.BAM_INDEX_FILE_MIME_TYPE));
  final BAMShardIndexer indexer = new BAMShardIndexer(outputStream, reader.getFileHeader(), sequenceIndex);

  long processedReads = 0;
  long skippedReads = 0;

  // create and write the content
  if (bytesToProcess > 0) {
    SAMRecordIterator it = reader.iterator();
    boolean foundRecords = false;
    while (it.hasNext()) {
      SAMRecord r = it.next();
      if (!r.getReferenceName().equals(sequenceName)) {
        if (foundRecords) {
          LOG.info("Finishing index building for " + sequenceName + " after processing " + processedReads);
          break;
        }
        skippedReads++;
        continue;
      } else if (!foundRecords) {
        LOG.info("Found records for refrence " + sequenceName + " after skipping " + skippedReads);
        foundRecords = true;
      }
      indexer.processAlignment(r);
      processedReads++;
    }
    it.close();
  } else {
    LOG.info("No records for refrence " + sequenceName + ": writing empty index ");
  }
  long noCoordinateReads = indexer.finish();
  c.output(baiFilePath);
  c.output(NO_COORD_READS_COUNT_TAG, noCoordinateReads);
  LOG.info("Generated " + baiFilePath + ", " + processedReads + " reads, " +
      noCoordinateReads + " no coordinate reads, " + skippedReads + ", skipped reads");
  stopWatch.stop();
  Metrics.distribution(WriteBAIFn.class, "Indexing Shard Processing Time (sec)").update(
      stopWatch.elapsed(TimeUnit.SECONDS));
  Metrics.counter(WriteBAIFn.class, "Finished Indexing Shard Count").inc();
  Metrics.counter(WriteBAIFn.class, "Indexed reads").inc(processedReads);
  Metrics.counter(WriteBAIFn.class, "Indexed no coordinate reads").inc(noCoordinateReads);
  Metrics.distribution(WriteBAIFn.class, "Reads Per Indexing Shard").update(processedReads);
}
 
Example 20
Source File: ReAligner.java    From abra2 with MIT License
private void getSamHeaderAndReadLength() throws IOException {
	
	Logger.info("Identifying header and determining read length");
	
	this.samHeaders = new SAMFileHeader[this.inputSams.length];
	
	for (int i=0; i<this.inputSams.length; i++) {
	
		SamReader reader = SAMRecordUtils.getSamReader(inputSams[i]);
		try {
			samHeaders[i] = reader.getFileHeader();
			samHeaders[i].setSortOrder(SAMFileHeader.SortOrder.unsorted);
			
			Iterator<SAMRecord> iter = reader.iterator();
			
			int cnt = 0;
			while ((iter.hasNext()) && (cnt < 1000000)) {
				SAMRecord read = iter.next();
				this.readLength = Math.max(this.readLength, read.getReadLength());
				this.maxMapq = Math.max(this.maxMapq, read.getMappingQuality());
				
				// Assumes aligner sets proper pair flag correctly
				if ((isPairedEnd) && (read.getReadPairedFlag()) && (read.getProperPairFlag())) {
					this.minInsertLength = Math.min(this.minInsertLength, Math.abs(read.getInferredInsertSize()));
					this.maxInsertLength = Math.max(this.maxInsertLength, Math.abs(read.getInferredInsertSize()));
				}
				
				cnt += 1;
			}
			
			// Allow some fudge in insert length
			minInsertLength = Math.max(minInsertLength - 2*readLength, 0);
			maxInsertLength = maxInsertLength + 2*readLength;
			
		} finally {
			reader.close();
		}
	}
	
	Logger.info("Min insert length: " + minInsertLength);
	Logger.info("Max insert length: " + maxInsertLength);
			
	Logger.info("Max read length is: " + readLength);
	if (assemblerSettings.getMinContigLength() < 1) {
		assemblerSettings.setMinContigLength(Math.max(readLength+1, MIN_CONTIG_LENGTH));
	}
	Logger.info("Min contig length: " + assemblerSettings.getMinContigLength());
}