Java Code Examples for htsjdk.samtools.SAMRecord#getAlignmentBlocks()

The following examples show how to use htsjdk.samtools.SAMRecord#getAlignmentBlocks() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: AnnotationUtils.java    From Drop-seq with MIT License 6 votes vote down vote up
/**
 * Get the LocusFunction on a gene by gene basis for the alignment blocks of this read.
 * If a read is split into multiple blocks, each block should point to the same gene - if blocks point to
 * different genes (which can't happen, you can't splice different genes together), then that gene result should not be returned.
 * Instead, return null in those cases.
 * @param alignmentBlocks
 * @param g
 * @return
 */
private LocusFunction getLocusFunctionForRead (final SAMRecord rec, final Gene g) {
	List<AlignmentBlock> alignmentBlocks = rec.getAlignmentBlocks();

       LocusFunction [] blockSummaryFunction = new LocusFunction[alignmentBlocks.size()];
       Set<Gene> temp = new HashSet<>();
       temp.add(g);

       for (int i=0; i<alignmentBlocks.size(); i++) {
       	AlignmentBlock alignmentBlock =alignmentBlocks.get(i);

       	LocusFunction [] blockFunctions=getLocusFunctionsByBlock(alignmentBlock, temp);
       	LocusFunction blockFunction = getLocusFunction(blockFunctions, false);
       	blockSummaryFunction[i]=blockFunction;
       }

       LocusFunction readFunction = getLocusFunction(blockSummaryFunction, false);
	return readFunction;
}
 
Example 2
Source File: AnnotationUtils.java    From Drop-seq with MIT License 6 votes vote down vote up
/**
 * For each alignment block, see which genes have exons that intersect it.
 * Only count genes where the alignment blocks are consistent - either the alignment blocks point to the same gene, or
 * an alignment block points to a gene and the other to no genes/exons in the set.
 * Note that this is not strand specific, which may cause problems if a read overlaps two genes on opposite strands that have overlapping exons.
 * @param rec
 * @param genes A set of genes that the read originally overlaps.
 * @param allowMultipleGenes.  If false, and a read overlaps multiple gene exons, then none of the genes are returned.  If true, return the set of all genes.
 * @return
 */
//TODO: if this is used in the future, make it strand specific.
public Set<Gene> getConsistentExons (final SAMRecord rec, final Set<Gene> genes, final boolean allowMultiGeneReads) {

	Set<Gene> result = new HashSet<>();
	String refName = rec.getReferenceName();
	List<AlignmentBlock> alignmentBlocks = rec.getAlignmentBlocks();
	for (AlignmentBlock b: alignmentBlocks) {
		Set<Gene> blockGenes = getAlignmentBlockonGeneExon(refName, b, genes);
		result.addAll(blockGenes);
		/*
		// if result is not empty and blockGenes isn't empty, intersect the set and set the result as this new set.
		if (result.size()>0 && blockGenes.size()>0) {
			if (allowMultiGeneReads) result.addAll(blockGenes);
			if (!allowMultiGeneReads)  result.retainAll(blockGenes);
		} else
			// if blockGenes is populated and you're here, then result is empty, so set result to these results
			result=blockGenes;
		*/
	}
	if (!allowMultiGeneReads & result.size()>1) return new HashSet<>();
	return result;
}
 
Example 3
Source File: TagReadWithInterval.java    From Drop-seq with MIT License 6 votes vote down vote up
private SAMRecord tagRead(final SAMRecord r, final OverlapDetector<Interval> od) {
	Set<Interval>intervals = new HashSet<>();
	List<AlignmentBlock> blocks = r.getAlignmentBlocks();
	for (AlignmentBlock b: blocks) {
		int refStart =b.getReferenceStart();
		int refEnd = refStart+b.getLength()-1;
		Interval v = new Interval(r.getReferenceName(), refStart, refEnd);
		Collection<Interval> blockResult = od.getOverlaps(v);
		intervals.addAll(blockResult);
	}
	String tagName = getIntervalName(intervals);
	if (tagName!=null)
		r.setAttribute(this.TAG, tagName);
	else
		r.setAttribute(this.TAG, null);
	return (r);

}
 
Example 4
Source File: SNPUMICellReadIteratorWrapper.java    From Drop-seq with MIT License 6 votes vote down vote up
/**
 * Check if a read overlaps any SNPs in the OverlapDetector.  Tag reads with SNPs.
 * If more than 1 SNP tags a read, make a read for each SNP.
 * Simplified since data goes through GeneFunctionIteratorWrapper to take care of how reads/genes interact.
 */
private void processSNP (final SAMRecord r) {
	List<AlignmentBlock> blocks = r.getAlignmentBlocks();

	Collection<String> snps = new HashSet<>();

	for (AlignmentBlock b: blocks) {
		int start = b.getReferenceStart();
		int end = start + b.getLength() -1;

		Interval i = new Interval(r.getReferenceName(), start, end);
		Collection<Interval> overlaps = this.snpIntervals.getOverlaps(i);
		for (Interval o: overlaps)
			snps.add(IntervalTagComparator.toString(o));
	}

	// 1 read per SNP.
	for (String snp:snps) {
		SAMRecord rr = Utils.getClone(r);
		rr.setAttribute(this.snpTag, snp);
		queueRecordForOutput(rr);
	}
}
 
Example 5
Source File: AnnotationUtils.java    From Drop-seq with MIT License 5 votes vote down vote up
/**
 * For a read, split into the alignment blocks.
 * For each alignment block, determine the genes the block overlaps, and the locus function of those genes.
 *
 * Each alignment block can generate a different functional annotation on the same gene.  These should be retained.
 * For example, block one can align to an exon, and block two to an intron, and both those annotations are retained.
 *
 * Only retain genes where alignment blocks all reference that gene.  If block one refers to genes A,B and block two to gene A only, then only retain gene A.
 *
 */
public Map<Gene, List<LocusFunction>> getFunctionalDataForRead (final SAMRecord rec, final OverlapDetector<Gene> geneOverlapDetector) {
	List<AlignmentBlock> alignmentBlocks = rec.getAlignmentBlocks();

	Map<AlignmentBlock, Map<Gene, List<LocusFunction>>> map = new HashMap<>();
	// gather the locus functions for each alignment block.
	for (AlignmentBlock block: alignmentBlocks) {
		Interval interval = getInterval(rec.getReferenceName(), block);
		Map<Gene, List<LocusFunction>> locusFunctionsForGeneMap = getFunctionalDataForInterval(interval, geneOverlapDetector);
		map.put(block, locusFunctionsForGeneMap);
	}
	// simplify genes by only using genes that are common to all alignment blocks.
	Map<Gene, List<LocusFunction>> result = simplifyFunctionalDataAcrossAlignmentBlocks(map);
	return result;
}
 
Example 6
Source File: EarliestFragmentPrimaryAlignmentSelectionStrategy.java    From picard with MIT License 5 votes vote down vote up
/**
 * Returns 1-based index of first base in read that corresponds to M in CIGAR string.
 * Note that first is relative to 5' end, so that for reverse-strand alignment, the index of
 * the last base aligned is computed relative to the end of the read.
 */
int getIndexOfFirstAlignedBase(final SAMRecord rec) {
    final List<AlignmentBlock> alignmentBlocks = rec.getAlignmentBlocks();
    if (rec.getReadNegativeStrandFlag()) {
        final AlignmentBlock alignmentBlock = alignmentBlocks.get(alignmentBlocks.size() - 1);
        return rec.getReadLength() - CoordMath.getEnd(alignmentBlock.getReadStart(), alignmentBlock.getLength()) + 1;
    } else {
        return alignmentBlocks.get(0).getReadStart();
    }
}
 
Example 7
Source File: CountingFilter.java    From picard with MIT License 5 votes vote down vote up
@Override
public final boolean filterOut(final SAMRecord record) {
    final boolean filteredOut = reallyFilterOut(record);
    if (filteredOut) {
        ++filteredRecords;
        for (final AlignmentBlock block : record.getAlignmentBlocks()) {
            this.filteredBases += block.getLength();
        }
    }
    return filteredOut;
}
 
Example 8
Source File: EarliestFragmentPrimaryAlignmentSelectionStrategy.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Returns 1-based index of first base in read that corresponds to M in CIGAR string.
 * Note that first is relative to 5' end, so that for reverse-strand alignment, the index of
 * the last base aligned is computed relative to the end of the read.
 */
int getIndexOfFirstAlignedBase(final SAMRecord rec) {
    final List<AlignmentBlock> alignmentBlocks = rec.getAlignmentBlocks();
    if (rec.getReadNegativeStrandFlag()) {
        final AlignmentBlock alignmentBlock = alignmentBlocks.get(alignmentBlocks.size() - 1);
        return rec.getReadLength() - CoordMath.getEnd(alignmentBlock.getReadStart(), alignmentBlock.getLength()) + 1;
    } else {
        return alignmentBlocks.get(0).getReadStart();
    }
}
 
Example 9
Source File: CollectSequencingArtifactMetrics.java    From picard with MIT License 4 votes vote down vote up
@Override
protected void acceptRead(final SAMRecord rec, final ReferenceSequence ref) {
    // see if the whole read should be skipped
    if (recordFilter.filterOut(rec)) return;

    // check read group + library
    final String library = (rec.getReadGroup() == null) ? UNKNOWN_LIBRARY : getOrElse(rec.getReadGroup().getLibrary(), UNKNOWN_LIBRARY);
    if (!libraries.contains(library)) {
        // should never happen if SAM is valid
        throw new PicardException("Record contains library that is missing from header: " + library);
    }

    // set up some constants that don't change in the loop below
    final int contextFullLength = 2 * CONTEXT_SIZE + 1;
    final ArtifactCounter counter = artifactCounters.get(library);
    final byte[] readBases = rec.getReadBases();
    final byte[] readQuals;
    if (USE_OQ) {
        final byte[] tmp = rec.getOriginalBaseQualities();
        readQuals = tmp == null ? rec.getBaseQualities() : tmp;
    } else {
        readQuals = rec.getBaseQualities();
    }

    // iterate over aligned positions
    for (final AlignmentBlock block : rec.getAlignmentBlocks()) {
        for (int offset = 0; offset < block.getLength(); offset++) {
            // remember, these are 1-based!
            final int readPos = block.getReadStart() + offset;
            final int refPos = block.getReferenceStart() + offset;

            // skip low BQ sites
            final byte qual = readQuals[readPos - 1];
            if (qual < MINIMUM_QUALITY_SCORE) continue;

            // skip N bases in read
            final char readBase = Character.toUpperCase((char)readBases[readPos - 1]);
            if (readBase == 'N') continue;

            /**
             * Skip regions outside of intervals.
             *
             * NB: IntervalListReferenceSequenceMask.get() has side-effects which assume
             * that successive ReferenceSequence's passed to this method will be in-order
             * (e.g. it will break if you call acceptRead() with chr1, then chr2, then chr1
             * again). So this only works if the underlying iteration is coordinate-sorted.
             */
            if (intervalMask != null && !intervalMask.get(ref.getContigIndex(), refPos)) continue;

            // skip dbSNP sites
            if (dbSnpMask != null && dbSnpMask.isDbSnpSite(ref.getName(), refPos)) continue;

            // skip the ends of the reference
            final int contextStartIndex = refPos - CONTEXT_SIZE - 1;
            if (contextStartIndex < 0 || contextStartIndex + contextFullLength > ref.length()) continue;

            // skip contexts with N bases
            final String context = getRefContext(ref, contextStartIndex, contextFullLength);
            if (context.contains("N")) continue;

            // skip non-ACGT bases
            if (!SequenceUtil.isUpperACGTN((byte)readBase))
                continue;

            // count the base!
            counter.countRecord(context, readBase, rec);
        }
    }
}
 
Example 10
Source File: RrbsMetricsCollector.java    From picard with MIT License 4 votes vote down vote up
public void acceptRecord(final SAMRecordAndReference args) {
	mappedRecordCount++;

	final SAMRecord samRecord = args.getSamRecord();
	final ReferenceSequence referenceSequence = args.getReferenceSequence();

	final byte[] readBases = samRecord.getReadBases();
	final byte[] readQualities = samRecord.getBaseQualities();
	final byte[] refBases = referenceSequence.getBases();

	if (samRecord.getReadLength() < minReadLength) {
		smallReadCount++;
		return;
	} else if (SequenceUtil.countMismatches(samRecord, refBases, true) > Math.round(samRecord.getReadLength() * maxMismatchRate)) {
		mismatchCount++;
		return;
	}

	// We only record non-CpG C sites if there was at least one CpG in the read, keep track of
	// the values for this record and then apply to the global totals if valid
	int recordCpgs = 0;

	for (final AlignmentBlock alignmentBlock : samRecord.getAlignmentBlocks()) {
		final int blockLength = alignmentBlock.getLength();
		final int refFragmentStart = alignmentBlock.getReferenceStart() - 1;
		final int readFragmentStart = alignmentBlock.getReadStart() - 1;

		final byte[] refFragment = getFragment(refBases, refFragmentStart, blockLength);
		final byte[] readFragment = getFragment(readBases, readFragmentStart, blockLength);
		final byte[] readQualityFragment = getFragment(readQualities, readFragmentStart, blockLength);

		if (samRecord.getReadNegativeStrandFlag()) {
			// In the case of a negative strand, reverse (and complement for base arrays) the reference,
			// reads & qualities so that it can be treated as a positive strand for the rest of the process
			SequenceUtil.reverseComplement(refFragment);
			SequenceUtil.reverseComplement(readFragment);
			SequenceUtil.reverseQualities(readQualityFragment);
		}

		for (int i=0; i < blockLength-1; i++) {
			final int curRefIndex = getCurRefIndex(refFragmentStart, blockLength, i, samRecord.getReadNegativeStrandFlag());

			// Look at a 2-base window to see if we're on a CpG site, and if so check for conversion
			// (CG -> TG). We do not consider ourselves to be on a CpG site if we're on the last base of a read
			if ((SequenceUtil.basesEqual(refFragment[i], SequenceUtil.C)) &&
					(SequenceUtil.basesEqual(refFragment[i+1], SequenceUtil.G))) {
				// We want to catch the case where there's a CpG in the reference, even if it is not valid
				// to prevent the C showing up as a non-CpG C down below. Otherwise this could have been all
				// in one if statement
				if (isValidCpg(refFragment, readFragment, readQualityFragment, i)) {
					recordCpgs++;
					final CpgLocation curLocation = new CpgLocation(samRecord.getReferenceName(), curRefIndex);
					cpgTotal.increment(curLocation);
					if (SequenceUtil.isBisulfiteConverted(readFragment[i], refFragment[i])) {
						cpgConverted.increment(curLocation);
					}
				}
				i++;
			} else if (isC(refFragment[i], readFragment[i]) && isAboveCytoQcThreshold(readQualities, i) &&
					SequenceUtil.bisulfiteBasesEqual(false, readFragment[i+1], refFragment[i+1])) {
				// C base in the reference that's not associated with a CpG
				nCytoTotal++;
				if (SequenceUtil.isBisulfiteConverted(readFragment[i], refFragment[i])) {
					nCytoConverted++;
				}
			}
		}
	}

	if (recordCpgs == 0) {
		noCpgCount++;
	}
}