Java Code Examples for htsjdk.samtools.SAMFileHeader#clone()

The following examples show how to use htsjdk.samtools.SAMFileHeader#clone(). You can go to the original project or source file by following the links above each example.
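
Every example below follows the same pattern: SAMFileHeader#clone() produces an independent copy of the header, which is then modified (sort order, read groups, sequence dictionary, or comments) while the original header stays untouched. A minimal sketch of that pattern, written for this page rather than taken from any of the projects below:

import htsjdk.samtools.SAMFileHeader;

public final class CloneHeaderSketch {
    // Return a coordinate-sorted copy of the given header without mutating the input.
    public static SAMFileHeader coordinateSortedCopy(final SAMFileHeader original) {
        final SAMFileHeader copy = original.clone();           // independent copy of the header
        copy.setSortOrder(SAMFileHeader.SortOrder.coordinate); // mutate only the copy
        return copy;
    }
}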
Example 1
Source File: BwaSparkEngine.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * @param ctx           the Spark context
 * @param referenceFile the path to the reference file, named <i>prefix</i>.fa, which is used to find the image file named <i>prefix</i>.fa.img.
 *                      Can be <code>null</code> if the indexFileName is provided.
 * @param indexFileName the index image file name that already exists, or <code>null</code> to have the image file automatically distributed.
 * @param inputHeader   the SAM file header to use for reads
 * @param refDictionary the sequence dictionary to use for reads if the SAM file header doesn't have one (or it's empty)
 */
public BwaSparkEngine(final JavaSparkContext ctx,
                      final String referenceFile,
                      final String indexFileName,
                      SAMFileHeader inputHeader,
                      final SAMSequenceDictionary refDictionary) {
    Utils.nonNull(inputHeader);
    this.ctx = ctx;
    if (indexFileName != null) {
        this.indexFileName = indexFileName;
        this.resolveIndexFileName = false;
    } else {
        // referenceFile is only required when the index image name must be derived from it
        // (per the javadoc, referenceFile may be null when indexFileName is provided)
        Utils.nonNull(referenceFile);
        String indexFile = referenceFile + REFERENCE_INDEX_IMAGE_FILE_SUFFIX;
        ctx.addFile(indexFile); // distribute index file to all executors
        this.indexFileName = IOUtils.getPath(indexFile).getFileName().toString();
        this.resolveIndexFileName = true;
    }

    if (inputHeader.getSequenceDictionary() == null || inputHeader.getSequenceDictionary().isEmpty()) {
        Utils.nonNull(refDictionary);
        inputHeader = inputHeader.clone();
        inputHeader.setSequenceDictionary(refDictionary);
    }
    broadcastHeader = ctx.broadcast(inputHeader);
}
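
Note: the header is cloned only when a fallback sequence dictionary has to be attached, so the caller's header is never mutated before being broadcast to the executors.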
 
Example 2
Source File: CreateIntervalsFiles.java    From Drop-seq with MIT License
/**
 * Create a SAM header with a subset of the sequences
 * @param samHeader header to be subsetted
 * @param sequenceNames sequences to include
 * @return clone of the original header, but with only the named sequences
 */
private static SAMFileHeader createSubsetSamHeader(final SAMFileHeader samHeader, final List<String> sequenceNames) {
    final SAMFileHeader ret = samHeader.clone();
    final List<SAMSequenceRecord> sequenceRecords = new ArrayList<>(sequenceNames.size());
    for (final String sequence: sequenceNames) {
        final SAMSequenceRecord sequenceRecord = ret.getSequence(sequence);
        if (sequenceRecord == null)
            throw new RuntimeException("Sequence '" + sequence + "' specified on command line but not found in sequence dictionary");
        sequenceRecords.add(sequenceRecord);
    }
    ret.getSequenceDictionary().setSequences(sequenceRecords);
    return ret;
}
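
Note: clone() copies the sequence dictionary along with the rest of the header, so calling setSequences on the clone's dictionary leaves the input header intact.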
 
Example 3
Source File: AnnotatedIntervalCodec.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Create an annotated interval header based on a config file (for locatable field names only) and a list of annotations (the rest of the fields).
 *
 * @param outputConfigFile config path for determining the locatable column headers in the output file.  If comma separated lists are present as values, the first entry will be chosen.
 *                         If the config file contains a numeric index as an output column name, an exception is thrown.  Never {@code null}.
 * @param annotations  Names of the annotations to render.  If any of the locatable columns are in the annotation, those columns will be removed from the annotations list in the header.
 *                     Never {@code null}.
 * @param samFileHeader SAM FileHeader to prepend to the data.  {@code null} is allowed.
 * @return a header that can be used in an AnnotatedFileWriter.  Structured comments will be updated here.  Never {@code null}.
 */
public static AnnotatedIntervalHeader createHeaderForWriter(final Path outputConfigFile, final List<String> annotations, final SAMFileHeader samFileHeader) {

    Utils.nonNull(annotations);
    Utils.nonNull(outputConfigFile);

    final Pair<Boolean, Properties> validityAndPropertiesPair = XsvLocatableTableCodec.getAndValidateConfigFileContentsOnPath(outputConfigFile, true);
    final boolean                   isValid                   = validityAndPropertiesPair.getLeft();
    final Properties                headerNameProperties      = validityAndPropertiesPair.getRight();
    if ( !isValid ) {
        throw new UserException.BadInput("Error: invalid configuration file given: " + outputConfigFile.toUri().toString());
    }
    final String                    contigColumnName          = determineOutputColumnFromList(headerNameProperties.getProperty(DataSourceUtils.CONFIG_FILE_FIELD_NAME_CONTIG_COLUMN));
    final String                    startColumnName           = determineOutputColumnFromList(headerNameProperties.getProperty(DataSourceUtils.CONFIG_FILE_FIELD_NAME_START_COLUMN));
    final String                    endColumnName             = determineOutputColumnFromList(headerNameProperties.getProperty(DataSourceUtils.CONFIG_FILE_FIELD_NAME_END_COLUMN));

    XsvLocatableTableCodec.validateLocatableColumnName(contigColumnName);
    XsvLocatableTableCodec.validateLocatableColumnName(startColumnName);
    XsvLocatableTableCodec.validateLocatableColumnName(endColumnName);

    final List<String> finalAnnotations = annotations.stream()
            .filter(a -> !a.equals(contigColumnName) && !a.equals(startColumnName) && !a.equals(endColumnName))
            .collect(Collectors.toList());

    //TODO: Test NIO?  See https://github.com/broadinstitute/gatk/issues/4579
    final List<String> commentsToWrite = AnnotatedIntervalCodec.updateStructuredComments(contigColumnName, startColumnName, endColumnName, samFileHeader.getComments());

    // Writing the SAM header takes a bit of manual work: clone it, then replace the comments on the copy
    final SAMFileHeader finalSamHeader = samFileHeader.clone();
    finalSamHeader.setComments(commentsToWrite);

    return new AnnotatedIntervalHeader(contigColumnName, startColumnName, endColumnName, finalAnnotations, finalSamHeader);
}
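
Note: the clone exists so setComments can replace the structured comments on the writer's copy; the samFileHeader supplied by the caller keeps its original comments.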
 
Example 4
Source File: AddOrReplaceReadGroups.java    From picard with MIT License
protected int doWork() {
    IOUtil.assertInputIsValid(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);

    final SamReader in = SamReaderFactory.makeDefault()
        .referenceSequence(REFERENCE_SEQUENCE)
        .open(SamInputResource.of(INPUT));

    // create the read-group we'll be using
    final SAMReadGroupRecord rg = new SAMReadGroupRecord(RGID);
    rg.setLibrary(RGLB);
    rg.setPlatform(RGPL);
    rg.setSample(RGSM);
    rg.setPlatformUnit(RGPU);
    if (RGCN != null) rg.setSequencingCenter(RGCN);
    if (RGDS != null) rg.setDescription(RGDS);
    if (RGDT != null) rg.setRunDate(RGDT);
    if (RGPI != null) rg.setPredictedMedianInsertSize(RGPI);
    if (RGPG != null) rg.setProgramGroup(RGPG);
    if (RGPM != null) rg.setPlatformModel(RGPM);
    if (RGKS != null) rg.setKeySequence(RGKS);
    if (RGFO != null) rg.setFlowOrder(RGFO);

    log.info(String.format("Created read-group ID=%s PL=%s LB=%s SM=%s%n", rg.getId(), rg.getPlatform(), rg.getLibrary(), rg.getSample()));

    // create the new header and output file
    final SAMFileHeader inHeader = in.getFileHeader();
    final SAMFileHeader outHeader = inHeader.clone();
    outHeader.setReadGroups(Collections.singletonList(rg));
    if (SORT_ORDER != null) outHeader.setSortOrder(SORT_ORDER);

    final SAMFileWriter outWriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(outHeader,
            outHeader.getSortOrder() == inHeader.getSortOrder(),
            OUTPUT);

    final ProgressLogger progress = new ProgressLogger(log);
    for (final SAMRecord read : in) {
        read.setAttribute(SAMTag.RG.name(), RGID);
        outWriter.addAlignment(read);
        progress.record(read);
    }

    // cleanup
    CloserUtil.close(in);
    outWriter.close();
    return 0;
}
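
Note: cloning inHeader lets the tool swap in the new read group (and optionally a new sort order) on the output header while the input header stays unchanged; the presorted flag passed to makeSAMOrBAMWriter is true only when the two sort orders match, so the writer re-sorts exactly when SORT_ORDER differs.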
 
Example 5
Source File: MarkDuplicatesSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Main method for marking duplicates. Takes a JavaRDD of GATKRead and an associated SAMFileHeader with corresponding
 * sorting information and returns a new {@code JavaRDD<GATKRead>} in which duplicate read templates have been marked.
 *
 * NOTE: This method expects the incoming reads to be grouped by read name (queryname sorted/querygrouped) and for this
 *       to be explicitly set in the provided header. Furthermore, all the reads in a template must be grouped
 *       into the same partition or duplicate marking may be incorrect.
 *       If MarkDuplicates detects that reads are sorted some other way, it will perform an extra sort operation first,
 *       so for performance it is preferable to pass reads to this method already sorted.
 *
 * @param reads input reads to be duplicate marked
 * @param header header corresponding to the input reads
 * @param scoringStrategy method by which duplicates are detected
 * @param opticalDuplicateFinder utility for identifying optical duplicates within each duplicate set
 * @param numReducers number of partitions to separate the data into
 * @param dontMarkUnmappedMates when true, unmapped mates of duplicate fragments will be marked as non-duplicates
 * @param taggingPolicy determines whether optical duplicates and library duplicates are labeled with the "DT" tag
 * @return A JavaRDD of GATKReads where duplicate flags have been set
 */
public static JavaRDD<GATKRead> mark(final JavaRDD<GATKRead> reads, final SAMFileHeader header,
                                     final MarkDuplicatesScoringStrategy scoringStrategy,
                                     final OpticalDuplicateFinder opticalDuplicateFinder,
                                     final int numReducers, final boolean dontMarkUnmappedMates,
                                     final MarkDuplicates.DuplicateTaggingPolicy taggingPolicy) {
    final boolean markUnmappedMates = !dontMarkUnmappedMates;
    SAMFileHeader headerForTool = header.clone();

    // If the input isn't queryname sorted, sort it before duplicate marking
    final JavaRDD<GATKRead> sortedReadsForMarking = SparkUtils.querynameSortReadsIfNecessary(reads, numReducers, headerForTool);

    // If we need to remove optical duplicates or tag them, then make sure we are keeping track
    final boolean markOpticalDups = (taggingPolicy != MarkDuplicates.DuplicateTaggingPolicy.DontTag);

    final JavaPairRDD<MarkDuplicatesSparkUtils.IndexPair<String>, Integer> namesOfNonDuplicates = MarkDuplicatesSparkUtils.transformToDuplicateNames(headerForTool, scoringStrategy, opticalDuplicateFinder, sortedReadsForMarking, numReducers, markOpticalDups);

    // Here we explicitly repartition the read names of the unmarked reads to match the partitioning of the original bam
    final JavaRDD<Tuple2<String,Integer>> repartitionedReadNames = namesOfNonDuplicates
            .mapToPair(pair -> new Tuple2<>(pair._1.getIndex(), new Tuple2<>(pair._1.getValue(),pair._2)))
            .partitionBy(new KnownIndexPartitioner(sortedReadsForMarking.getNumPartitions()))
            .values();

    // Here we combine the original bam with the repartitioned unmarked readnames to produce our marked reads
    return sortedReadsForMarking.zipPartitions(repartitionedReadNames, (readsIter, readNamesIter)  -> {
        final Map<String,Integer> namesOfNonDuplicateReadsAndOpticalCounts = new HashMap<>();
        readNamesIter.forEachRemaining(tup -> {
            if (namesOfNonDuplicateReadsAndOpticalCounts.putIfAbsent(tup._1, tup._2) != null) {
                throw new GATKException(String.format("Detected multiple mark duplicate record objects corresponding to read with name '%s'; this could mean the file sort order is incorrect or that a previous tool has allowed read names to span multiple partitions", tup._1()));
            }
        });

        return Utils.stream(readsIter)
                .peek(read -> read.setIsDuplicate(false))
                .peek(read -> read.setAttribute(MarkDuplicates.DUPLICATE_TYPE_TAG, (String) null))
                .peek(read -> {
                    // Handle reads that have been marked as non-duplicates (which also get tagged with optical duplicate summary statistics)
                    if (namesOfNonDuplicateReadsAndOpticalCounts.containsKey(read.getName())) {
                        // If it's an optical duplicate, mark it. (Note: we only expect these to exist if optical duplicate marking is on)
                        if (namesOfNonDuplicateReadsAndOpticalCounts.get(read.getName()) == OPTICAL_DUPLICATE_MARKER) {
                            read.setIsDuplicate(true);
                            read.setAttribute(MarkDuplicates.DUPLICATE_TYPE_TAG, MarkDuplicates.DUPLICATE_TYPE_SEQUENCING);

                        // Otherwise treat it normally as a non-duplicate.
                        } else {
                            read.setIsDuplicate(false);
                            if (markUnmappedMates || !read.isUnmapped()) {
                                int dupCount = namesOfNonDuplicateReadsAndOpticalCounts.replace(read.getName(), NO_OPTICAL_MARKER);
                                if (dupCount > -1) {
                                    read.setTransientAttribute(MarkDuplicatesSparkUtils.OPTICAL_DUPLICATE_TOTAL_ATTRIBUTE_NAME, dupCount);
                                }
                            }
                        }
                        // Mark unmapped read pairs as non-duplicates
                    } else if (ReadUtils.readAndMateAreUnmapped(read)) {
                        read.setIsDuplicate(false);
                        // Everything else is a duplicate
                    } else {
                        if (markUnmappedMates || !read.isUnmapped()) {
                            read.setIsDuplicate(true);
                            if (taggingPolicy == MarkDuplicates.DuplicateTaggingPolicy.All) {
                                read.setAttribute(MarkDuplicates.DUPLICATE_TYPE_TAG, MarkDuplicates.DUPLICATE_TYPE_LIBRARY);
                            }
                        } else {
                            read.setIsDuplicate(false);
                        }
                    }
                }).iterator();
    });
}
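
Note: header.clone() gives the method a private copy that the queryname sort and the duplicate-marking utilities can work from (and, if needed, update) without side effects on the header the caller passed in.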
 
Example 6
Source File: ReadsPipelineSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
@Override
protected void runTool(final JavaSparkContext ctx) {
    String referenceFileName = addReferenceFilesForSpark(ctx, referenceArguments.getReferencePath());
    List<String> localKnownSitesFilePaths = addVCFsForSpark(ctx, knownVariants);

    final JavaRDD<GATKRead> alignedReads;
    final SAMFileHeader header;
    final BwaSparkEngine bwaEngine;
    if (align) {
        bwaEngine = new BwaSparkEngine(ctx, referenceArguments.getReferenceFileName(), bwaArgs.indexImageFile, getHeaderForReads(), getReferenceSequenceDictionary());
        if (bwaArgs.singleEndAlignment) {
            alignedReads = bwaEngine.alignUnpaired(getReads());
        } else {
            // filter reads after alignment in the case of paired reads since filtering does not know about pairs
            final ReadFilter filter = makeReadFilter(bwaEngine.getHeader());
            alignedReads = bwaEngine.alignPaired(getUnfilteredReads()).filter(filter::test);
        }
        header = bwaEngine.getHeader();
    } else {
        bwaEngine = null;
        alignedReads = getReads();
        header = getHeaderForReads();
    }

    final JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.mark(alignedReads, header, new OpticalDuplicateFinder(), markDuplicatesSparkArgumentCollection, getRecommendedNumReducers());

    // always coordinate-sort reads so BQSR can use queryLookaheadBases in FeatureDataSource
    final SAMFileHeader readsHeader = header.clone();
    readsHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
    final JavaRDD<GATKRead> sortedMarkedReads = SparkUtils.sortReadsAccordingToHeader(markedReads, readsHeader, numReducers);

    // The markedReads have already had the WellformedReadFilter applied to them, which
    // is all the filtering that MarkDupes and ApplyBQSR want. BQSR itself wants additional
    // filtering performed, so we do that here.
    //NOTE: this doesn't honor enabled/disabled commandline filters
    final ReadFilter bqsrReadFilter = ReadFilter.fromList(BaseRecalibrator.getBQSRSpecificReadFilterList(), header);

    JavaRDD<GATKRead> markedFilteredReadsForBQSR = sortedMarkedReads.filter(bqsrReadFilter::test);

    JavaPairRDD<GATKRead, Iterable<GATKVariant>> readsWithVariants = JoinReadsWithVariants.join(markedFilteredReadsForBQSR, localKnownSitesFilePaths);
    final RecalibrationReport bqsrReport = BaseRecalibratorSparkFn.apply(readsWithVariants, getHeaderForReads(), referenceFileName, bqsrArgs);

    final Broadcast<RecalibrationReport> reportBroadcast = ctx.broadcast(bqsrReport);
    final JavaRDD<GATKRead> finalReads = ApplyBQSRSparkFn.apply(sortedMarkedReads, reportBroadcast, getHeaderForReads(), applyBqsrArgs.toApplyBQSRArgumentCollection(bqsrArgs));

    if (outputBam != null) { // only write output of BQSR if output BAM is specified
        writeReads(ctx, outputBam, finalReads, header, true);
    }

    // Run Haplotype Caller
    final ReadFilter hcReadFilter = ReadFilter.fromList(HaplotypeCallerEngine.makeStandardHCReadFilters(), header);
    final JavaRDD<GATKRead> filteredReadsForHC = finalReads.filter(hcReadFilter::test);
    SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();
    final List<SimpleInterval> intervals = hasUserSuppliedIntervals() ? getIntervals() : IntervalUtils.getAllIntervalsForReference(sequenceDictionary);

    List<ShardBoundary> intervalShards = intervals.stream()
            .flatMap(interval -> Shard.divideIntervalIntoShards(interval, shardingArgs.readShardSize, shardingArgs.readShardPadding, sequenceDictionary).stream())
            .collect(Collectors.toList());

    HaplotypeCallerSpark.callVariantsWithHaplotypeCallerAndWriteOutput(ctx, filteredReadsForHC, readsHeader, sequenceDictionary, referenceArguments.getReferenceFileName(), intervalShards, hcArgs, shardingArgs, assemblyRegionArgs, output, makeVariantAnnotations(), logger, strict, createOutputVariantIndex);

    if (bwaEngine != null) {
        bwaEngine.close();
    }
}
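
Note: header.clone() lets the pipeline switch the sort order to coordinate for BQSR and the HaplotypeCaller without mutating the header produced by the BWA engine and still used when writing the BAM output.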