htsjdk.samtools.util.SortingCollection Java Examples

The following examples show how to use htsjdk.samtools.util.SortingCollection. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: CollapseTagWithContext.java    From Drop-seq with MIT License 6 votes vote down vote up
/**
 * If the number of records exceeds the number of records allowed in memory, spill to disk.
 * @param groupingIter
 * @param writer
 * @param outMetrics
 * @param header
 */
private void lowMemoryIteration (PeekableGroupingIterator<SAMRecord> groupingIter,									 
								 SAMFileWriter writer, PrintStream outMetrics, SAMFileHeader header) {
	log.info("Running (slower) memory efficient mode");				
       while (groupingIter.hasNext()) {
       	// for this group, get a SortingCollection.  Note that this is not used for sorting.  It is merely
		// an unsorted collection if there might be more objects than can fit in RAM.
       	SortingCollection<SAMRecord> sortingCollection = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(header), NO_OP_COMPARATOR, this.MAX_RECORDS_IN_RAM);

       	// you have to grab the next element, in case it's the first of the group but not the first group!
       	sortingCollection.add(groupingIter.next()); 
       	
       	// spool the reads for a whole group into the sorting collection to operate on - the code uses a multi-pass approach so we can't just iterate over the grouping iterator.
       	while (groupingIter.hasNextInGroup())         		
       		sortingCollection.add(groupingIter.next());
       	
       	// wrap up the sorting collection for adding records.
       	sortingCollection.doneAdding();
       	sortingCollection.setDestructiveIteration(false);
       	
       	processContext(sortingCollection, writer, false, outMetrics);        	
       }	
}
 
Example #2
Source File: SortVcf.java    From picard with MIT License 6 votes vote down vote up
/**
 * Merge the inputs and sort them by adding each input's content to a single SortingCollection.
 * <p/>
 * NB: It would be better to have a merging iterator as in MergeSamFiles, as this would perform better for pre-sorted inputs.
 * Here, we are assuming inputs are unsorted, and so adding their VariantContexts iteratively is fine for now.
 * MergeVcfs exists for simple merging of presorted inputs.
 *
 * @param readers      - a list of VCFFileReaders, one for each input VCF
 * @param outputHeader - The merged header whose information we intend to use in the final output file
 */
private SortingCollection<VariantContext> sortInputs(final List<VCFFileReader> readers, final VCFHeader outputHeader) {
    final ProgressLogger readProgress = new ProgressLogger(log, 25000, "read", "records");

    // NB: The default MAX_RECORDS_IN_RAM may not be appropriate here. VariantContexts are smaller than SamRecords
    // We would have to play around empirically to find an appropriate value. We are not performing this optimization at this time.
    final SortingCollection<VariantContext> sorter =
            SortingCollection.newInstance(
                    VariantContext.class,
                    new VCFRecordCodec(outputHeader, VALIDATION_STRINGENCY != ValidationStringency.STRICT),
                    outputHeader.getVCFRecordComparator(),
                    MAX_RECORDS_IN_RAM,
                    TMP_DIR);
    int readerCount = 1;
    for (final VCFFileReader reader : readers) {
        log.info("Reading entries from input file " + readerCount);
        for (final VariantContext variantContext : reader) {
            sorter.add(variantContext);
            readProgress.record(variantContext.getContig(), variantContext.getStart());
        }
        reader.close();
        readerCount++;
    }
    return sorter;
}
 
Example #3
Source File: GtcToVcf.java    From picard with MIT License 6 votes vote down vote up
private void fillContexts(final SortingCollection<VariantContext> contexts, final InfiniumGTCFile gtcFile,
                          final Build37ExtendedIlluminaManifest manifest, final InfiniumEGTFile egtFile) {
    final ProgressLogger progressLogger = new ProgressLogger(log, 100000, "sorted");

    final Iterator<Build37ExtendedIlluminaManifestRecord> iterator = manifest.extendedIterator();
    int gtcIndex = 0;

    int numVariantsWritten = 0;

    while (iterator.hasNext()) {
        final Build37ExtendedIlluminaManifestRecord record = iterator.next();

        if (!record.isBad()) {
            InfiniumGTCRecord gtcRecord = gtcFile.getRecord(gtcIndex);
            VariantContext context = makeVariantContext(record, gtcRecord, egtFile, progressLogger);
            numVariantsWritten++;
            contexts.add(context);
        }
        gtcIndex++;
    }

    log.info(numVariantsWritten + " Variants were written to file");
    log.info(gtcFile.getNumberOfSnps() + " SNPs in the GTC file");
    log.info(manifest.getNumAssays() + " Variants on the " + manifest.getDescriptorFileName() + " genotyping array manifest file");
}
 
Example #4
Source File: GtcToVcf.java    From picard with MIT License 6 votes vote down vote up
/**
 * Writes out the VariantContext objects in the order presented to the supplied output file
 * in VCF format.
 */
private void writeVcf(final SortingCollection<VariantContext> variants,
                      final File output,
                      final SAMSequenceDictionary dict,
                      final VCFHeader vcfHeader) {

    try (final VariantContextWriter writer = new VariantContextWriterBuilder()
            .setOutputFile(output)
            .setReferenceDictionary(dict)
            .setOptions(VariantContextWriterBuilder.DEFAULT_OPTIONS)
            .build()) {

        writer.writeHeader(vcfHeader);

        for (final VariantContext variant : variants) {
            if (variant.getAlternateAlleles().size() > 1) {
                variant.getCommonInfo().addFilter(InfiniumVcfFields.TRIALLELIC);
            }

            writer.add(variant);
        }
    }
}
 
Example #5
Source File: NewIlluminaBasecallsConverter.java    From picard with MIT License 6 votes vote down vote up
private synchronized void addRecord(final String barcode, final CLUSTER_OUTPUT_RECORD record) {
    // Grab the existing collection, or initialize it if it doesn't yet exist
    SortingCollection<CLUSTER_OUTPUT_RECORD> recordCollection = this.barcodeToRecordCollection.get(barcode);
    if (recordCollection == null) {
        // TODO: The implementation here for supporting ignoreUnexpectedBarcodes is not efficient,
        // but the alternative is an extensive rewrite.  We are living with the inefficiency for
        // this special case for the time being.
        if (!barcodeRecordWriterMap.containsKey(barcode)) {
            if (ignoreUnexpectedBarcodes) {
                return;
            }
            throw new PicardException(String.format("Read records with barcode %s, but this barcode was not expected.  (Is it referenced in the parameters file?)", barcode));
        }
        recordCollection = newSortingCollection();
        this.barcodeToRecordCollection.put(barcode, recordCollection);
    }
    recordCollection.add(record);
}
 
Example #6
Source File: IlluminaBasecallsConverter.java    From picard with MIT License 6 votes vote down vote up
/**
 * @param basecallsDir             Where to read basecalls from.
 * @param lane                     What lane to process.
 * @param readStructure            How to interpret each cluster.
 * @param barcodeRecordWriterMap   Map from barcode to CLUSTER_OUTPUT_RECORD writer.  If demultiplex is false, must contain
 *                                 one writer stored with key=null.
 * @param demultiplex              If true, output is split by barcode, otherwise all are written to the same output stream.
 * @param maxReadsInRamPerTile     Configures number of reads each tile will store in RAM before spilling to disk.
 * @param tmpDirs                  For SortingCollection spilling.
 * @param numProcessors            Controls number of threads.  If <= 0, the number of threads allocated is
 *                                 available cores - numProcessors.
 * @param forceGc                  Force explicit GC periodically.  This is good for causing memory maps to be released.
 * @param firstTile                (For debugging) If non-null, start processing at this tile.
 * @param tileLimit                (For debugging) If non-null, process no more than this many tiles.
 * @param outputRecordComparator   For sorting output records within a single tile.
 * @param codecPrototype           For spilling output records to disk.
 * @param outputRecordClass        Inconveniently needed to create SortingCollections.
 * @param includeNonPfReads        If true, will include ALL reads (including those which do not have PF set)
 * @param ignoreUnexpectedBarcodes If true, will ignore reads whose called barcode is not found in barcodeRecordWriterMap,
 *                                 otherwise will throw an exception
 */
public IlluminaBasecallsConverter(final File basecallsDir, final int lane, final ReadStructure readStructure,
                                  final Map<String, ? extends ConvertedClusterDataWriter<CLUSTER_OUTPUT_RECORD>> barcodeRecordWriterMap,
                                  final boolean demultiplex,
                                  final int maxReadsInRamPerTile,
                                  final List<File> tmpDirs,
                                  final int numProcessors, final boolean forceGc,
                                  final Integer firstTile, final Integer tileLimit,
                                  final Comparator<CLUSTER_OUTPUT_RECORD> outputRecordComparator,
                                  final SortingCollection.Codec<CLUSTER_OUTPUT_RECORD> codecPrototype,
                                  final Class<CLUSTER_OUTPUT_RECORD> outputRecordClass,
                                  final BclQualityEvaluationStrategy bclQualityEvaluationStrategy,
                                  final boolean applyEamssFiltering,
                                  final boolean includeNonPfReads,
                                  final boolean ignoreUnexpectedBarcodes
) {
    this(basecallsDir, null, lane, readStructure,
            barcodeRecordWriterMap, demultiplex, maxReadsInRamPerTile,
            tmpDirs, numProcessors, forceGc, firstTile, tileLimit,
            outputRecordComparator, codecPrototype, outputRecordClass,
            bclQualityEvaluationStrategy, applyEamssFiltering,
            includeNonPfReads, ignoreUnexpectedBarcodes);
}
 
Example #7
Source File: IlluminaBasecallsConverter.java    From picard with MIT License 6 votes vote down vote up
/**
 * Adds the provided record to this tile.
 */
public synchronized void addRecord(final String barcode, final CLUSTER_OUTPUT_RECORD record) {
    this.recordCount += 1;

    // Grab the existing collection, or initialize it if it doesn't yet exist
    SortingCollection<CLUSTER_OUTPUT_RECORD> recordCollection = this.barcodeToRecordCollection.get(barcode);
    if (recordCollection == null) {
        // TODO: The implementation here for supporting ignoreUnexpectedBarcodes is not efficient,
        // but the alternative is an extensive rewrite.  We are living with the inefficiency for
        // this special case for the time being.
        if (!barcodeRecordWriterMap.containsKey(barcode)) {
            if (ignoreUnexpectedBarcodes) {
                return;
            }
            throw new PicardException(String.format("Read records with barcode %s, but this barcode was not expected.  (Is it referenced in the parameters file?)", barcode));
        }
        recordCollection = this.newSortingCollection();
        this.barcodeToRecordCollection.put(barcode, recordCollection);
        this.barcodeToProcessingState.put(barcode, null);
    }
    recordCollection.add(record);
}
 
Example #8
Source File: RevertSam.java    From picard with MIT License 6 votes vote down vote up
RevertSamSorter(
        final boolean outputByReadGroup,
        final Map<String, SAMFileHeader> headerMap,
        final SAMFileHeader singleOutHeader,
        final int maxRecordsInRam) {

    this.outputByReadGroup = outputByReadGroup;
    if (outputByReadGroup) {
        for (final Map.Entry<String, SAMFileHeader> entry : headerMap.entrySet()) {
            final String readGroupId = entry.getKey();
            final SAMFileHeader outHeader = entry.getValue();
            final SortingCollection<SAMRecord> sorter = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(outHeader), new SAMRecordQueryNameComparator(), maxRecordsInRam);
            sorterMap.put(readGroupId, sorter);
        }
        singleSorter = null;
    } else {
        singleSorter = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(singleOutHeader), new SAMRecordQueryNameComparator(), maxRecordsInRam);
    }
}
 
Example #9
Source File: SortingIteratorFactory.java    From Drop-seq with MIT License 6 votes vote down vote up
/**
 *
 * @param componentType Required because of Java generic syntax limitations.
 * @param underlyingIterator All records are pulled from this iterator, which is then closed if closeable.
 * @param comparator Defines sort order.
 * @param codec For spilling to temp files
 * @param maxRecordsInRam
 * @param progressLogger Pass null if not interested in logging.
 * @return An iterator in the order defined by comparator, that will produce all the records from underlyingIterator.
 */
public static <T> CloseableIterator<T> create(final Class<T> componentType,
                                              final Iterator<T> underlyingIterator,
                                              final Comparator<T> comparator,
                                              final SortingCollection.Codec<T> codec,
                                              final int maxRecordsInRam,
                                              final ProgressCallback progressLogger) {

    SortingCollection<T> sortingCollection =
            SortingCollection.newInstance(componentType, codec, comparator, maxRecordsInRam);

    while (underlyingIterator.hasNext()) {
        final T rec = underlyingIterator.next();
        if (progressLogger != null)
progressLogger.logProgress(rec);
        sortingCollection.add(rec);
    }
    CloseableIterator<T> ret = sortingCollection.iterator();
    CloserUtil.close(underlyingIterator);
    return ret;
}
 
Example #10
Source File: BasecallsConverter.java    From picard with MIT License 5 votes vote down vote up
/**
 * @param barcodeRecordWriterMap   Map from barcode to CLUSTER_OUTPUT_RECORD writer.  If demultiplex is false, must contain
 *                                 one writer stored with key=null.
 * @param demultiplex              If true, output is split by barcode, otherwise all are written to the same output stream.
 * @param maxReadsInRamPerTile     Configures number of reads each tile will store in RAM before spilling to disk.
 * @param tmpDirs                  For SortingCollection spilling.
 * @param numProcessors            Controls number of threads.  If <= 0, the number of threads allocated is
 *                                 available cores - numProcessors.
 * @param outputRecordComparator   For sorting output records within a single tile.
 * @param codecPrototype           For spilling output records to disk.
 * @param outputRecordClass        Inconveniently needed to create SortingCollections.
 * @param ignoreUnexpectedBarcodes If true, will ignore reads whose called barcode is not found in barcodeRecordWriterMap,
 */
BasecallsConverter(final Map<String, ? extends ConvertedClusterDataWriter<CLUSTER_OUTPUT_RECORD>> barcodeRecordWriterMap,
                   final int maxReadsInRamPerTile,
                   final List<File> tmpDirs,
                   final SortingCollection.Codec<CLUSTER_OUTPUT_RECORD> codecPrototype,
                   final boolean ignoreUnexpectedBarcodes,
                   final boolean demultiplex,
                   final Comparator<CLUSTER_OUTPUT_RECORD> outputRecordComparator,
                   final BclQualityEvaluationStrategy bclQualityEvaluationStrategy,
                   final Class<CLUSTER_OUTPUT_RECORD> outputRecordClass,
                   final int numProcessors,
                   final IlluminaDataProviderFactory factory) {

    this.barcodeRecordWriterMap = barcodeRecordWriterMap;
    this.maxReadsInRamPerTile = maxReadsInRamPerTile;
    this.tmpDirs = tmpDirs;
    this.codecPrototype = codecPrototype;
    this.ignoreUnexpectedBarcodes = ignoreUnexpectedBarcodes;
    this.demultiplex = demultiplex;
    this.outputRecordComparator = outputRecordComparator;
    this.bclQualityEvaluationStrategy = bclQualityEvaluationStrategy;
    this.outputRecordClass = outputRecordClass;
    this.factory = factory;


    if (numProcessors == 0) {
        this.numThreads = Runtime.getRuntime().availableProcessors();
    } else if (numProcessors < 0) {
        this.numThreads = Runtime.getRuntime().availableProcessors() + numProcessors;
    } else {
        this.numThreads = numProcessors;
    }
}
 
Example #11
Source File: CreateSequenceDictionary.java    From varsim with BSD 2-Clause "Simplified" License 5 votes vote down vote up
private SortingCollection<String> makeSortingCollection() {
  final String name = getClass().getSimpleName();
  final File tmpDir = IOUtil.createTempDir(name, null);
  tmpDir.deleteOnExit();
  // 256 byte for one name, and 1/10 part of all memory for this, rough estimate
  long maxNamesInRam = Runtime.getRuntime().maxMemory() / 256 / 10;
  return SortingCollection.newInstance(
          String.class,
          new StringCodec(),
          String::compareTo,
          (int) Math.min(maxNamesInRam, Integer.MAX_VALUE),
          tmpDir
  );
}
 
Example #12
Source File: SequenceDictionaryUtils.java    From picard with MIT License 5 votes vote down vote up
public static SortingCollection<String> makeSortingCollection() {
    final File tmpDir = IOUtil.createTempDir("SamDictionaryNames", null);
    tmpDir.deleteOnExit();
    // 256 byte for one name, and 1/10 part of all memory for this, rough estimate
    long maxNamesInRam = Runtime.getRuntime().maxMemory() / 256 / 10;
    return SortingCollection.newInstance(
            String.class,
            new StringCodec(),
            String::compareTo,
            (int) Math.min(maxNamesInRam, Integer.MAX_VALUE),
            tmpDir.toPath()
    );
}
 
Example #13
Source File: SortVcf.java    From picard with MIT License 5 votes vote down vote up
private void writeSortedOutput(final VCFHeader outputHeader, final SortingCollection<VariantContext> sortedOutput) {
    final ProgressLogger writeProgress = new ProgressLogger(log, 25000, "wrote", "records");
    final EnumSet<Options> options = CREATE_INDEX ? EnumSet.of(Options.INDEX_ON_THE_FLY) : EnumSet.noneOf(Options.class);
    final VariantContextWriter out = new VariantContextWriterBuilder().
            setReferenceDictionary(outputHeader.getSequenceDictionary()).
            setOptions(options).
            setOutputFile(OUTPUT).build();
    out.writeHeader(outputHeader);
    for (final VariantContext variantContext : sortedOutput) {
        out.add(variantContext);
        writeProgress.record(variantContext.getContig(), variantContext.getStart());
    }
    out.close();
}
 
Example #14
Source File: SortVcf.java    From picard with MIT License 5 votes vote down vote up
@Override
protected int doWork() {
    final List<String> sampleList = new ArrayList<String>();

    for (final File input : INPUT) IOUtil.assertFileIsReadable(input);

    if (SEQUENCE_DICTIONARY != null) IOUtil.assertFileIsReadable(SEQUENCE_DICTIONARY);

    SAMSequenceDictionary samSequenceDictionary = null;
    if (SEQUENCE_DICTIONARY != null) {
        samSequenceDictionary = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).getFileHeader(SEQUENCE_DICTIONARY).getSequenceDictionary();
        CloserUtil.close(SEQUENCE_DICTIONARY);
    }

    // Gather up a file reader and file header for each input file. Check for sequence dictionary compatibility along the way.
    collectFileReadersAndHeaders(sampleList, samSequenceDictionary);

    // Create the merged output header from the input headers
    final VCFHeader outputHeader = new VCFHeader(VCFUtils.smartMergeHeaders(inputHeaders, false), sampleList);

    // Load entries into the sorting collection
    final SortingCollection<VariantContext> sortedOutput = sortInputs(inputReaders, outputHeader);

    // Output to the final file
    writeSortedOutput(outputHeader, sortedOutput);

    return 0;
}
 
Example #15
Source File: SamComparison.java    From picard with MIT License 5 votes vote down vote up
private void setupLenientDuplicateChecking() {
    /* Setup for duplicate marking checks.  Recs which disagree on duplicate marking will be added to markDuplicatesCheckLeft/Right and then fed to a DuplicateSetIterator to check if
     * differences are due only to choice of representative read within duplicate set.
     */
    final SAMFileHeader leftHeader = leftReader.getFileHeader();
    final SAMFileHeader rightHeader = rightReader.getFileHeader();
    final SAMRecordDuplicateComparator leftDupComparator = new SAMRecordDuplicateComparator(Collections.singletonList(leftHeader));
    final SAMRecordDuplicateComparator rightDupComparator = new SAMRecordDuplicateComparator(Collections.singletonList(rightHeader));
    markDuplicatesCheckLeft = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(leftHeader),
            leftDupComparator, SAMFileWriterImpl.getDefaultMaxRecordsInRam());
    markDuplicatesCheckRight = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(rightHeader),
            rightDupComparator, SAMFileWriterImpl.getDefaultMaxRecordsInRam());
}
 
Example #16
Source File: RevertSam.java    From picard with MIT License 5 votes vote down vote up
void add(final SAMRecord rec) {
    final SortingCollection<SAMRecord> sorter;
    if (outputByReadGroup) {
        sorter = sorterMap.get(rec.getReadGroup().getId());
    } else {
        sorter = singleSorter;
    }
    sorter.add(rec);
}
 
Example #17
Source File: IlluminaBasecallsConverter.java    From picard with MIT License 5 votes vote down vote up
private synchronized SortingCollection<CLUSTER_OUTPUT_RECORD> newSortingCollection() {
    final int maxRecordsInRam =
            Math.max(1, maxReadsInRamPerTile /
                    barcodeRecordWriterMap.size());
    return SortingCollection.newInstance(
            outputRecordClass,
            codecPrototype.clone(),
            outputRecordComparator,
            maxRecordsInRam,
            tmpDirs);
}
 
Example #18
Source File: GtcToVcf.java    From picard with MIT License 5 votes vote down vote up
@Override
protected int doWork() {
    Sex fingerprintSex = getFingerprintSex(FINGERPRINT_GENOTYPES_VCF_FILE);
    String gtcGender = getGenderFromGtcFile(GENDER_GTC, ILLUMINA_BEAD_POOL_MANIFEST_FILE);
    try (InfiniumGTCFile infiniumGTCFile = new InfiniumGTCFile(INPUT, ILLUMINA_BEAD_POOL_MANIFEST_FILE);
         InfiniumEGTFile infiniumEGTFile = new InfiniumEGTFile(CLUSTER_FILE)) {
        final Build37ExtendedIlluminaManifest manifest = setupAndGetManifest(infiniumGTCFile);

        final VCFHeader vcfHeader = createVCFHeader(manifest, infiniumGTCFile, gtcGender, fingerprintSex, CLUSTER_FILE,
                REFERENCE_SEQUENCE, refSeq.getSequenceDictionary());

        // Setup a collection that will sort contexts properly
        // Necessary because input GTC file is not sorted
        final SortingCollection<VariantContext> contexts =
                SortingCollection.newInstance(
                        VariantContext.class,
                        new VCFRecordCodec(vcfHeader),
                        new VariantContextComparator(refSeq.getSequenceDictionary()),
                        MAX_RECORDS_IN_RAM,
                        TMP_DIR.stream().map(File::toPath).toArray(Path[]::new));

        // fill the sorting collection
        fillContexts(contexts, infiniumGTCFile, manifest, infiniumEGTFile);

        writeVcf(contexts, OUTPUT, refSeq.getSequenceDictionary(), vcfHeader);

        return 0;
    } catch (IOException e) {
        throw new PicardException("Error processing GTC File: " + INPUT.getAbsolutePath(), e);
    }
}
 
Example #19
Source File: NewIlluminaBasecallsConverter.java    From picard with MIT License 5 votes vote down vote up
private synchronized SortingCollection<CLUSTER_OUTPUT_RECORD> newSortingCollection() {
    final int maxRecordsInRam =
            Math.max(1, maxReadsInRamPerTile /
                    barcodeRecordWriterMap.size());
    return SortingCollection.newInstanceFromPaths(
            outputRecordClass,
            codecPrototype.clone(),
            outputRecordComparator,
            maxRecordsInRam,
            IOUtil.filesToPaths(tmpDirs));
}
 
Example #20
Source File: EstimateLibraryComplexity.java    From picard with MIT License 4 votes vote down vote up
@Override
public SortingCollection.Codec<PairedReadSequence> clone() { return new PairedReadWithBarcodesCodec(); }
 
Example #21
Source File: SortingCollectionSink.java    From Drop-seq with MIT License 4 votes vote down vote up
public SortingCollectionSink(SortingCollection<T> collection) {
    this.collection = collection;
}
 
Example #22
Source File: DetectBeadSynthesisErrors.java    From Drop-seq with MIT License 4 votes vote down vote up
/**
 * Find all the cell barcodes that are biased.
 *
 * This walks through many (perhaps all?) of the cell barcodes, and for cell barcodes that have a sufficient number of UMIs, looks to see if there's an error
 * @param iter The BAM iterator to walk through
 * @param out the verbose output stream.
 * @param outSummary The summary output stream.
 *
 * TODO: A less memory-hog version of this would write out the summary file as it runs.  Could even write this out to a SortingIteratorFactory by implementing a codec...
 * Only need to hang onto errors that are SYNTH_MISSING_BASE, and leave the rest null (and check for that when running barcode repair.)
 *
 * @return A collection of biased cell barcodes.
 */
private BiasedBarcodeCollection findBiasedBarcodes (final UMIIterator iter, final PrintStream out, final File outSummary, final Integer lastUMIBase) {
	log.info("Finding Cell Barcodes with UMI errors");
	// Group the stream of UMICollections into groups with the same cell barcode.
       GroupingIterator<UMICollection> groupingIterator = new GroupingIterator<>(iter,
               new Comparator<UMICollection>() {
                   @Override
                   public int compare(final UMICollection o1, final UMICollection o2) {
                       return o1.getCellBarcode().compareTo(o2.getCellBarcode());
                   }
               });


	// for holding barcodes results.  The key is the cell barcode, the value is the first base to pad.
	// Used for cleanup of BAMs.
	Map<String, BeadSynthesisErrorData> errorBarcodesWithPositions = new HashMap<>();

		// for holding UMI Strings efficiently
	// TODO: evaluate if this is needed this anymore now that the report is written out disk immediately.
	StringInterner  umiStringCache = new StringInterner();

	// a sorting collection so big data can spill to disk before it's sorted and written out as a report.
	SortingCollection<BeadSynthesisErrorData> sortingCollection= SortingCollection.newInstance(BeadSynthesisErrorData.class, new BeadSynthesisErrorDataCodec(), new BeadSynthesisErrorData.SizeComparator(), this.MAX_BARCODE_ERRORS_IN_RAM);

       // gather up summary stats
    	BeadSynthesisErrorsSummaryMetric summary = new BeadSynthesisErrorsSummaryMetric();

    	// track the list of cell barcodes with sufficient UMIs to process.  We'll need them later to find intended sequences.
    	ObjectCounter<String> umisPerCellBarcode = new ObjectCounter<>();

    	// track the UMI Bias at the last base
    	Map<String, Double> umiBias = new HashMap<>();

    	// a log for processing
    	ProgressLogger prog = new ProgressLogger(log, 1000000, "Processed Cell/Gene UMIs");

    	// main data generation loop.
    	// to ease memory usage, after generating the BeadSynthesisErrorData object, use its cell barcode string for registering additional data.
       for (final List<UMICollection> umiCollectionList : groupingIterator) {
           BeadSynthesisErrorData bsed = buildBeadSynthesisErrorData(umiCollectionList, umiStringCache, prog);
           // if the cell has too few UMIs, then go to the next cell and skip all processing.
           if (bsed.getNumTranscripts() < this.MIN_UMIS_PER_CELL)
			// not sure I even want to track this...
           	// summary.LOW_UMI_COUNT++;
           	continue;

           // add the result to the summary
           summary=addDataToSummary(bsed, summary);
           umisPerCellBarcode.incrementByCount(bsed.getCellBarcode(), bsed.getNumTranscripts()); // track the cell barcode if it's sufficiently large to process.

           // explicitly call getting the error type.
           BeadSynthesisErrorType errorType=bsed.getErrorType(this.EXTREME_BASE_RATIO, this.detectPrimerTool, this.EDIT_DISTANCE);

           // gather up the UMI bias at the last base.  Note: this can be over-ridden by supplying a last base position.
           double barcodeUMIBias = bsed.getPolyTFrequencyLastBase();
           if (lastUMIBase!=null)  {
           	// bounds check
           	double freqs [] = bsed.getPolyTFrequency();
           	if (lastUMIBase>freqs.length)
           		throw new IllegalArgumentException("Trying to override UMI last base position with ["+ lastUMIBase +"] but UMI length is [" + freqs.length +"]");
           	barcodeUMIBias=freqs[lastUMIBase-1];
           }

           umiBias.put(bsed.getCellBarcode(), barcodeUMIBias);

           // finalize object so it uses less memory.
           bsed.finalize();
           // only add to the collection if you have UMIs and a repairable error.
           if (bsed.getUMICount()>=this.MIN_UMIS_PER_CELL && errorType==BeadSynthesisErrorType.SYNTH_MISSING_BASE)
           	errorBarcodesWithPositions.put(bsed.getCellBarcode(), bsed);

           // add to sorting collection if you have enough UMIs.
           sortingCollection.add(bsed);
       }

       PeekableIterator<BeadSynthesisErrorData> bsedIter = new PeekableIterator<>(sortingCollection.iterator());

       log.info("Writing Biased UMI reports");
       // write out the records from the sorting collection.
       writeFile(bsedIter, out);
       // write out the summary
       writeSummary(summary, outSummary);
       CloserUtil.close(bsedIter);

       // the error barcodes we want to fix.
       BiasedBarcodeCollection result = new BiasedBarcodeCollection(errorBarcodesWithPositions, umisPerCellBarcode, umiBias);
       return result;
}
 
Example #23
Source File: VcfLineSortingCollection.java    From imputationserver with GNU Affero General Public License v3.0 4 votes vote down vote up
public static SortingCollection<VcfLine> newInstance(int maxRecords, String tempDir) {
	return SortingCollection.newInstance(VcfLine.class, new VcfLineCodec(), new VcfLineComparator(), maxRecords,
			new File(tempDir));
}
 
Example #24
Source File: NewIlluminaBasecallsConverter.java    From picard with MIT License 4 votes vote down vote up
RecordWriter(final ConvertedClusterDataWriter<CLUSTER_OUTPUT_RECORD> writer,
             final SortingCollection<CLUSTER_OUTPUT_RECORD> recordCollection, final String barcode) {
    this.writer = writer;
    this.recordCollection = recordCollection;
    this.barcode = barcode;
}
 
Example #25
Source File: RepresentativeReadIndexerCodec.java    From picard with MIT License 4 votes vote down vote up
public SortingCollection.Codec<RepresentativeReadIndexer> clone() {
    return new RepresentativeReadIndexerCodec();
}
 
Example #26
Source File: ReadEndsForMarkDuplicatesCodec.java    From picard with MIT License 4 votes vote down vote up
public SortingCollection.Codec<ReadEndsForMarkDuplicates> clone() {
    return new ReadEndsForMarkDuplicatesCodec();
}
 
Example #27
Source File: ReadEndsForMarkDuplicatesWithBarcodesCodec.java    From picard with MIT License 4 votes vote down vote up
@Override
public SortingCollection.Codec<ReadEndsForMarkDuplicates> clone() {
    return new ReadEndsForMarkDuplicatesWithBarcodesCodec();
}
 
Example #28
Source File: IlluminaBasecallsConverter.java    From picard with MIT License 4 votes vote down vote up
/**
         * Returns a PriorityRunnable that encapsulates the work involved with writing the provided tileRecord's data
         * for the given barcode to disk.
         *
         * @param tile       The tile from which the record was read
         * @param tileRecord The processing record associated with the tile
         * @param barcode    The barcode whose data within the tileRecord is to be written
         * @return The runnable that upon invocation writes the barcode's data from the tileRecord to disk
         */
        private PriorityRunnable newBarcodeWorkInstance(final Tile tile, final TileProcessingRecord tileRecord, final String barcode) {
            return new PriorityRunnable() {
                @Override
                public void run() {
                    try {
                        final SortingCollection<CLUSTER_OUTPUT_RECORD> records = tileRecord.getBarcodeRecords().get(barcode);
                        final ConvertedClusterDataWriter<CLUSTER_OUTPUT_RECORD> writer = barcodeRecordWriterMap.get(barcode);

                        log.debug(String.format("Writing records from tile %s with barcode %s ...", tile.getNumber(), barcode));

                        final PeekIterator<CLUSTER_OUTPUT_RECORD> it = new PeekIterator<>(records.iterator());
                        while (it.hasNext()) {
                            final CLUSTER_OUTPUT_RECORD rec = it.next();

                            /*
                             * PIC-330 Sometimes there are two reads with the same cluster coordinates, and thus
                             * the same read name.  Discard both of them.  This code assumes that the two first of pairs
                             * will come before the two second of pairs, so it isn't necessary to look ahead a different
                             * distance for paired end.  It also assumes that for paired ends there will be duplicates
                             * for both ends, so there is no need to be PE-aware.
                             */
                            if (it.hasNext()) {
                                final CLUSTER_OUTPUT_RECORD lookAhead = it.peek();

/* TODO: Put this in SAMFileWriter wrapper
                                if (!rec.getReadUnmappedFlag() || !lookAhead.getReadUnmappedFlag()) {
                                    throw new IllegalStateException("Should not have mapped reads.");
                                }
*/

                                if (outputRecordComparator.compare(rec, lookAhead) == 0) {
                                    it.next();
                                    log.info("Skipping reads with identical read names: " + rec.toString());
                                    continue;
                                }
                            }

                            writer.write(rec);
                            writeProgressLogger.record(null, 0);
                        }

                        tileRecord.setBarcodeState(barcode, TileBarcodeProcessingState.WRITTEN);
                        findAndEnqueueWorkOrSignalCompletion();

                    } catch (final RuntimeException | Error e) {
                        /*
                         * In the event of an internal failure, signal to the parent thread that something has gone
                         * wrong.  This is necessary because if an item of work fails to complete, the aggregator will
                         * will never reach its completed state, and it will never terminate.
                         */
                        parentThread.interrupt();
                        throw e;
                    }
                }

            };
        }
 
Example #29
Source File: EstimateLibraryComplexity.java    From picard with MIT License 4 votes vote down vote up
@Override
public SortingCollection.Codec<PairedReadSequence> clone() { return new PairedReadCodec(); }
 
Example #30
Source File: EstimateLibraryComplexity.java    From picard with MIT License 4 votes vote down vote up
public static SortingCollection.Codec<PairedReadSequence> getCodec() {
    return new PairedReadCodec();
}