Java Code Examples for htsjdk.samtools.util.SortingCollection#add()

The following examples show how to use htsjdk.samtools.util.SortingCollection#add(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: SortingIteratorFactory.java    From Drop-seq with MIT License 6 votes vote down vote up
/**
 * Drains an iterator into a {@link SortingCollection} and returns an iterator over the sorted records.
 *
 * @param componentType Required because of Java generic syntax limitations.
 * @param underlyingIterator All records are pulled from this iterator, which is then closed if closeable.
 *                           It is also closed when reading or adding a record fails part-way through.
 * @param comparator Defines sort order.
 * @param codec For spilling to temp files
 * @param maxRecordsInRam Passed straight to {@code SortingCollection.newInstance} as its record limit.
 * @param progressLogger Pass null if not interested in logging.
 * @return An iterator in the order defined by comparator, that will produce all the records from underlyingIterator.
 */
public static <T> CloseableIterator<T> create(final Class<T> componentType,
                                              final Iterator<T> underlyingIterator,
                                              final Comparator<T> comparator,
                                              final SortingCollection.Codec<T> codec,
                                              final int maxRecordsInRam,
                                              final ProgressCallback progressLogger) {

    final SortingCollection<T> sortingCollection =
            SortingCollection.newInstance(componentType, codec, comparator, maxRecordsInRam);

    try {
        while (underlyingIterator.hasNext()) {
            final T rec = underlyingIterator.next();
            if (progressLogger != null) {
                progressLogger.logProgress(rec);
            }
            sortingCollection.add(rec);
        }
        // The sorted iterator is obtained before the source is closed, as in the success path before.
        return sortingCollection.iterator();
    } finally {
        // Release the source iterator even if pulling or adding a record threw.
        CloserUtil.close(underlyingIterator);
    }
}
 
Example 2
Source File: CollapseTagWithContext.java    From Drop-seq with MIT License 6 votes vote down vote up
/**
 * Processes the input one group at a time, buffering each group's records in a
 * SortingCollection created with {@code MAX_RECORDS_IN_RAM} as its record limit.
 * The collection is used purely as a re-iterable buffer; it is built with NO_OP_COMPARATOR.
 *
 * @param groupingIter source of records, consumed one group at a time
 * @param writer passed through to processContext
 * @param outMetrics passed through to processContext
 * @param header used to build the BAMRecordCodec for the buffering collection
 */
private void lowMemoryIteration (PeekableGroupingIterator<SAMRecord> groupingIter,
                                 SAMFileWriter writer, PrintStream outMetrics, SAMFileHeader header) {
    log.info("Running (slower) memory efficient mode");
    while (groupingIter.hasNext()) {
        // One buffer per group.  No sorting is intended here; the collection only
        // serves as storage for a group that might not fit in RAM.
        final SortingCollection<SAMRecord> groupRecords = SortingCollection.newInstance(
                SAMRecord.class, new BAMRecordCodec(header), NO_OP_COMPARATOR, this.MAX_RECORDS_IN_RAM);

        // The first record is always taken unconditionally (it may open a new group that is
        // not the first group); the rest of the group follows while hasNextInGroup() holds.
        // Processing is multi-pass, so the whole group has to be spooled before it is handled.
        do {
            groupRecords.add(groupingIter.next());
        } while (groupingIter.hasNextInGroup());

        // Finish the adding phase and switch off destructive iteration before handing over.
        groupRecords.doneAdding();
        groupRecords.setDestructiveIteration(false);

        processContext(groupRecords, writer, false, outMetrics);
    }
}
 
Example 3
Source File: GtcToVcf.java    From picard with MIT License 6 votes vote down vote up
/**
 * Walks the extended manifest and, for every record that is not flagged bad, builds a
 * VariantContext from the manifest record and the GTC record at the same index and adds it
 * to {@code contexts}. Summary counts are logged once the manifest is exhausted.
 */
private void fillContexts(final SortingCollection<VariantContext> contexts, final InfiniumGTCFile gtcFile,
                          final Build37ExtendedIlluminaManifest manifest, final InfiniumEGTFile egtFile) {
    final ProgressLogger progressLogger = new ProgressLogger(log, 100000, "sorted");
    final Iterator<Build37ExtendedIlluminaManifestRecord> manifestRecords = manifest.extendedIterator();

    int numVariantsWritten = 0;

    // gtcIndex advances for every manifest record, including the bad ones that are skipped.
    for (int gtcIndex = 0; manifestRecords.hasNext(); gtcIndex++) {
        final Build37ExtendedIlluminaManifestRecord manifestRecord = manifestRecords.next();
        if (manifestRecord.isBad()) {
            continue;
        }

        final InfiniumGTCRecord gtcRecord = gtcFile.getRecord(gtcIndex);
        final VariantContext variant = makeVariantContext(manifestRecord, gtcRecord, egtFile, progressLogger);
        numVariantsWritten++;
        contexts.add(variant);
    }

    log.info(numVariantsWritten + " Variants were written to file");
    log.info(gtcFile.getNumberOfSnps() + " SNPs in the GTC file");
    log.info(manifest.getNumAssays() + " Variants on the " + manifest.getDescriptorFileName() + " genotyping array manifest file");
}
 
Example 4
Source File: NewIlluminaBasecallsConverter.java    From picard with MIT License 6 votes vote down vote up
/**
 * Adds a record to the collection registered for its barcode, creating and registering
 * that collection on first use. A barcode with no entry in barcodeRecordWriterMap is either
 * silently dropped (when ignoreUnexpectedBarcodes is set) or rejected with a PicardException.
 */
private synchronized void addRecord(final String barcode, final CLUSTER_OUTPUT_RECORD record) {
    final SortingCollection<CLUSTER_OUTPUT_RECORD> existing = this.barcodeToRecordCollection.get(barcode);
    if (existing != null) {
        existing.add(record);
        return;
    }

    // TODO: This way of supporting ignoreUnexpectedBarcodes is not efficient, but the
    // alternative is an extensive rewrite, so the inefficiency is accepted for this
    // special case for the time being.
    final boolean barcodeExpected = barcodeRecordWriterMap.containsKey(barcode);
    if (!barcodeExpected && ignoreUnexpectedBarcodes) {
        return;
    }
    if (!barcodeExpected) {
        throw new PicardException(String.format("Read records with barcode %s, but this barcode was not expected.  (Is it referenced in the parameters file?)", barcode));
    }

    // First record for an expected barcode: create its collection and register it.
    final SortingCollection<CLUSTER_OUTPUT_RECORD> created = newSortingCollection();
    this.barcodeToRecordCollection.put(barcode, created);
    created.add(record);
}
 
Example 5
Source File: IlluminaBasecallsConverter.java    From picard with MIT License 6 votes vote down vote up
/**
 * Adds the provided record to this tile.
 *
 * The record count is incremented for every call, including calls whose record is dropped
 * because its barcode is unexpected and ignoreUnexpectedBarcodes is set. An unexpected
 * barcode without that flag results in a PicardException.
 */
public synchronized void addRecord(final String barcode, final CLUSTER_OUTPUT_RECORD record) {
    this.recordCount++;

    final SortingCollection<CLUSTER_OUTPUT_RECORD> existing = this.barcodeToRecordCollection.get(barcode);
    if (existing != null) {
        existing.add(record);
        return;
    }

    // TODO: This way of supporting ignoreUnexpectedBarcodes is not efficient, but the
    // alternative is an extensive rewrite, so the inefficiency is accepted for this
    // special case for the time being.
    final boolean barcodeExpected = barcodeRecordWriterMap.containsKey(barcode);
    if (!barcodeExpected && ignoreUnexpectedBarcodes) {
        return;
    }
    if (!barcodeExpected) {
        throw new PicardException(String.format("Read records with barcode %s, but this barcode was not expected.  (Is it referenced in the parameters file?)", barcode));
    }

    // First record for an expected barcode: create its collection, register it, and
    // register the barcode in the processing-state map with a null state.
    final SortingCollection<CLUSTER_OUTPUT_RECORD> created = this.newSortingCollection();
    this.barcodeToRecordCollection.put(barcode, created);
    this.barcodeToProcessingState.put(barcode, null);
    created.add(record);
}
 
Example 6
Source File: SortVcf.java    From picard with MIT License 6 votes vote down vote up
/**
 * Pours the records of every input reader into one SortingCollection, which both merges and
 * sorts them. Each reader is closed once its records have been consumed.
 * <p/>
 * NB: A merging iterator (as in MergeSamFiles) would perform better for pre-sorted inputs.
 * Inputs are assumed to be unsorted here, so adding their VariantContexts one by one is fine for now.
 * MergeVcfs exists for simple merging of presorted inputs.
 *
 * @param readers      - a list of VCFFileReaders, one for each input VCF
 * @param outputHeader - The merged header whose information we intend to use in the final output file
 * @return the collection holding every record read from {@code readers}
 */
private SortingCollection<VariantContext> sortInputs(final List<VCFFileReader> readers, final VCFHeader outputHeader) {
    final ProgressLogger readProgress = new ProgressLogger(log, 25000, "read", "records");

    // NB: The default MAX_RECORDS_IN_RAM may not be appropriate here. VariantContexts are smaller than SamRecords.
    // Finding a better value would need empirical tuning, which has not been done.
    final VCFRecordCodec codec =
            new VCFRecordCodec(outputHeader, VALIDATION_STRINGENCY != ValidationStringency.STRICT);
    final SortingCollection<VariantContext> sorter = SortingCollection.newInstance(
            VariantContext.class,
            codec,
            outputHeader.getVCFRecordComparator(),
            MAX_RECORDS_IN_RAM,
            TMP_DIR);

    // 1-based ordinal of the input currently being read, used only for logging.
    int inputNumber = 0;
    for (final VCFFileReader reader : readers) {
        inputNumber++;
        log.info("Reading entries from input file " + inputNumber);
        for (final VariantContext vc : reader) {
            sorter.add(vc);
            readProgress.record(vc.getContig(), vc.getStart());
        }
        reader.close();
    }
    return sorter;
}
 
Example 7
Source File: RevertSam.java    From picard with MIT License 5 votes vote down vote up
/**
 * Routes the record to a sorter: the one mapped to the record's read group id when
 * outputByReadGroup is set, otherwise the single shared sorter.
 */
void add(final SAMRecord rec) {
    final SortingCollection<SAMRecord> target = outputByReadGroup
            ? sorterMap.get(rec.getReadGroup().getId())
            : singleSorter;
    target.add(rec);
}
 
Example 8
Source File: DetectBeadSynthesisErrors.java    From Drop-seq with MIT License 4 votes vote down vote up
/**
 * Find all the cell barcodes that are biased.
 *
 * This walks through many (perhaps all?) of the cell barcodes, and for cell barcodes that have a sufficient number of UMIs, looks to see if there's an error
 * @param iter The BAM iterator to walk through
 * @param out the verbose output stream.
 * @param outSummary The summary output stream.
 * @param lastUMIBase Optional 1-based position into the array returned by getPolyTFrequency(); when non-null,
 *                    the value at that position is recorded as the barcode's UMI bias instead of
 *                    getPolyTFrequencyLastBase().  Pass null to keep the default.
 *
 * TODO: A less memory-hog version of this would write out the summary file as it runs.  Could even write this out to a SortingIteratorFactory by implementing a codec...
 * Only need to hang onto errors that are SYNTH_MISSING_BASE, and leave the rest null (and check for that when running barcode repair.)
 *
 * @return A collection of biased cell barcodes.
 * @throws IllegalArgumentException if lastUMIBase is non-null and lies outside 1..(frequency array length)
 */
private BiasedBarcodeCollection findBiasedBarcodes (final UMIIterator iter, final PrintStream out, final File outSummary, final Integer lastUMIBase) {
    log.info("Finding Cell Barcodes with UMI errors");
    // Group the stream of UMICollections into groups with the same cell barcode.
    GroupingIterator<UMICollection> groupingIterator = new GroupingIterator<>(iter,
            new Comparator<UMICollection>() {
                @Override
                public int compare(final UMICollection o1, final UMICollection o2) {
                    return o1.getCellBarcode().compareTo(o2.getCellBarcode());
                }
            });

    // for holding barcode results.  The key is the cell barcode, the value is that barcode's error data.
    // Used for cleanup of BAMs.
    Map<String, BeadSynthesisErrorData> errorBarcodesWithPositions = new HashMap<>();

    // for holding UMI Strings efficiently
    // TODO: evaluate if this is still needed now that the report is written out to disk immediately.
    StringInterner umiStringCache = new StringInterner();

    // a sorting collection so big data can spill to disk before it's sorted and written out as a report.
    SortingCollection<BeadSynthesisErrorData> sortingCollection = SortingCollection.newInstance(BeadSynthesisErrorData.class, new BeadSynthesisErrorDataCodec(), new BeadSynthesisErrorData.SizeComparator(), this.MAX_BARCODE_ERRORS_IN_RAM);

    // gather up summary stats
    BeadSynthesisErrorsSummaryMetric summary = new BeadSynthesisErrorsSummaryMetric();

    // track the list of cell barcodes with sufficient UMIs to process.  We'll need them later to find intended sequences.
    ObjectCounter<String> umisPerCellBarcode = new ObjectCounter<>();

    // track the UMI Bias at the last base
    Map<String, Double> umiBias = new HashMap<>();

    // a log for processing
    ProgressLogger prog = new ProgressLogger(log, 1000000, "Processed Cell/Gene UMIs");

    // main data generation loop.
    // to ease memory usage, after generating the BeadSynthesisErrorData object, use its cell barcode string for registering additional data.
    for (final List<UMICollection> umiCollectionList : groupingIterator) {
        BeadSynthesisErrorData bsed = buildBeadSynthesisErrorData(umiCollectionList, umiStringCache, prog);
        // if the cell has too few UMIs, then go to the next cell and skip all processing.
        if (bsed.getNumTranscripts() < this.MIN_UMIS_PER_CELL) {
            // not sure I even want to track this...
            // summary.LOW_UMI_COUNT++;
            continue;
        }

        // add the result to the summary
        summary = addDataToSummary(bsed, summary);
        umisPerCellBarcode.incrementByCount(bsed.getCellBarcode(), bsed.getNumTranscripts()); // track the cell barcode if it's sufficiently large to process.

        // explicitly call getting the error type.
        BeadSynthesisErrorType errorType = bsed.getErrorType(this.EXTREME_BASE_RATIO, this.detectPrimerTool, this.EDIT_DISTANCE);

        // gather up the UMI bias at the last base.  Note: this can be over-ridden by supplying a last base position.
        double barcodeUMIBias = bsed.getPolyTFrequencyLastBase();
        if (lastUMIBase != null) {
            // bounds check: the position is 1-based, so both 0/negative values and values past the
            // end of the array must be rejected before indexing with lastUMIBase-1.
            double freqs [] = bsed.getPolyTFrequency();
            if (lastUMIBase < 1 || lastUMIBase > freqs.length) {
                throw new IllegalArgumentException("Trying to override UMI last base position with ["+ lastUMIBase +"] but UMI length is [" + freqs.length +"]");
            }
            barcodeUMIBias = freqs[lastUMIBase - 1];
        }

        umiBias.put(bsed.getCellBarcode(), barcodeUMIBias);

        // finalize object so it uses less memory.
        bsed.finalize();
        // only add to the collection if you have UMIs and a repairable error.
        if (bsed.getUMICount() >= this.MIN_UMIS_PER_CELL && errorType == BeadSynthesisErrorType.SYNTH_MISSING_BASE) {
            errorBarcodesWithPositions.put(bsed.getCellBarcode(), bsed);
        }

        // every barcode that passed the minimum-UMI filter above goes into the sorting collection.
        sortingCollection.add(bsed);
    }

    PeekableIterator<BeadSynthesisErrorData> bsedIter = new PeekableIterator<>(sortingCollection.iterator());

    log.info("Writing Biased UMI reports");
    // write out the records from the sorting collection.
    writeFile(bsedIter, out);
    // write out the summary
    writeSummary(summary, outSummary);
    CloserUtil.close(bsedIter);

    // the error barcodes we want to fix.
    BiasedBarcodeCollection result = new BiasedBarcodeCollection(errorBarcodesWithPositions, umisPerCellBarcode, umiBias);
    return result;
}