Java Code Examples for htsjdk.samtools.Cigar#getReferenceLength()

The following examples show how to use htsjdk.samtools.Cigar#getReferenceLength() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: HaplotypeUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Test(dataProvider = "TrimmingDataWithLeadingAndTrailingInsertions")
public void testTrimLeadingAndTrailingInsertions(final String bases, final String cigar, final int start, final int stop, final String expectedTrimmedCigar, final String expectedTrimmedBases) {
    final int offset = 10;
    final Cigar untrimmedCigar = TextCigarCodec.decode(cigar);
    final Haplotype haplotype = new Haplotype(bases.getBytes(), new UnvalidatingGenomeLoc("20", 0, offset, offset + untrimmedCigar.getReferenceLength() - 1));
    haplotype.setAlignmentStartHapwrtRef(offset);
    haplotype.setCigar(untrimmedCigar);

    final Haplotype trimmed = haplotype.trim(new SimpleInterval("20", offset + start, offset + stop));
    Assert.assertEquals(trimmed.getCigar().toString(), expectedTrimmedCigar);
    Assert.assertEquals(trimmed.getBaseString(), expectedTrimmedBases);
}
 
Example 2
Source File: ContigAlignmentsModifierUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
private static Iterable<AlignmentInterval> willThrowOnInvalidCigar(final Cigar cigar, final int readStart) throws GATKException {
    final AlignmentInterval detailsDoesnotMatter = new AlignmentInterval(
            new SimpleInterval("1", 1, cigar.getReferenceLength()),
            readStart, readStart+cigar.getReadLength() - CigarUtils.countClippedBases(cigar, CigarOperator.SOFT_CLIP), cigar,
            true, 60, 0, 100, ContigAlignmentsModifier.AlnModType.NONE);
    return ContigAlignmentsModifier.splitGappedAlignment(detailsDoesnotMatter, 1, cigar.getReadLength() + CigarUtils.countClippedBases(cigar, CigarOperator.HARD_CLIP));
}
 
Example 3
Source File: ContigAligner.java    From abra2 with MIT License 4 votes vote down vote up
public ContigAlignerResult align(String seq) {
		
		ContigAlignerResult result = null;
		
		SemiGlobalAligner.Result sgResult = aligner.align(seq, ref);
		Logger.trace("SG Alignment [%s]:\t%s, possible: %d to: %s", seq, sgResult, seq.length()*MATCH, ref);
		if (sgResult.score > MIN_ALIGNMENT_SCORE && sgResult.score > sgResult.secondBest && sgResult.endPosition > 0) {
			Cigar cigar = TextCigarCodec.decode(sgResult.cigar);
			
			CigarElement first = cigar.getFirstCigarElement();
			CigarElement last = cigar.getLastCigarElement();
		
			if (first.getOperator() == CigarOperator.I) {
				Logger.trace("Contig with leading insert: [%s] - [%s]", sgResult, seq);
			}
			
			// Do not allow indels at the edges of contigs.
			if (minAnchorLength > 0 && 
				(first.getOperator() != CigarOperator.M || first.getLength() < minAnchorLength || 
				last.getOperator() != CigarOperator.M || last.getLength() < minAnchorLength)) {

//				if ((first.getOperator() != CigarOperator.M || last.getOperator() != CigarOperator.M) &&
//						cigar.toString().contains("I")) {
//					Logger.trace("INDEL_NEAR_END: %s", cigar.toString());
//					return ContigAlignerResult.INDEL_NEAR_END;
//				}
//				
//				if ((first.getLength() < 5 || last.getLength() < 5) && 
//						cigar.toString().contains("I") &&
//						minAnchorLength >= 5) {
//					Logger.trace("INDEL_NEAR_END: %s", cigar.toString());
//					return ContigAlignerResult.INDEL_NEAR_END;						
//				}

				return null;
			}
				
			int endPos = sgResult.position + cigar.getReferenceLength();
			
			// Require first and last minAnchorLength bases of contig to be similar to ref
			int mismatches = 0;
			for (int i=0; i<minAnchorLength; i++) {
				if (seq.charAt(i) != ref.charAt(sgResult.position+i)) {
					mismatches += 1;
				}
			}
			
			if (mismatches > maxAnchorMismatches) {
				Logger.trace("Mismatches at beginning of: %s", seq);
			} else {
			
				mismatches = 0;
				for (int i=minAnchorLength; i>0; i--) {
					
					int seqIdx = seq.length()-i;
					int refIdx = endPos-i;
					
					if (seq.charAt(seqIdx) != ref.charAt(refIdx)) {
						mismatches += 1;
					}
				}
				
				if (mismatches > maxAnchorMismatches) {
					Logger.trace("Mismatches at end of: %s", seq);
				} else {
					result = finishAlignment(sgResult.position, endPos, sgResult.cigar, sgResult.score, seq);
				}
			}
		}
		
		return result;
	}
 
Example 4
Source File: AlignmentUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
/**
 * Aligns reads the haplotype, and then projects this alignment of read -> hap onto the reference
 * via the alignment of haplotype (via its getCigar) method.
 *
 * @param originalRead the read we want to write aligned to the reference genome
 * @param haplotype the haplotype that the read should be aligned to, before aligning to the reference
 * @param referenceStart the start of the reference that haplotype is aligned to.  Provides global coordinate frame.
 * @param isInformative true if the read is differentially informative for one of the haplotypes
 *
 * @param aligner
 * @throws IllegalArgumentException if {@code originalRead} is {@code null} or {@code haplotype} is {@code null} or it
 *   does not have a Cigar or the {@code referenceStart} is invalid (less than 1).
 *
 * @return a GATKRead aligned to reference. Never {@code null}.
 */
public static GATKRead createReadAlignedToRef(final GATKRead originalRead,
                                              final Haplotype haplotype,
                                              final Haplotype refHaplotype,
                                              final int referenceStart,
                                              final boolean isInformative,
                                              final SmithWatermanAligner aligner) {
    Utils.nonNull(originalRead);
    Utils.nonNull(haplotype);
    Utils.nonNull(refHaplotype);
    Utils.nonNull(haplotype.getCigar());
    Utils.nonNull(aligner);
    if ( referenceStart < 1 ) { throw new IllegalArgumentException("reference start much be >= 1 but got " + referenceStart); }

    // compute the smith-waterman alignment of read -> haplotype
    final SmithWatermanAlignment readToHaplotypeSWAlignment = aligner.align(haplotype.getBases(), originalRead.getBases(), CigarUtils.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP);
    if ( readToHaplotypeSWAlignment.getAlignmentOffset() == -1 ) {
        // sw can fail (reasons not clear) so if it happens just don't realign the read
        return originalRead;
    }

    final Cigar swCigar = new CigarBuilder().addAll(readToHaplotypeSWAlignment.getCigar()).make();

    // since we're modifying the read we need to clone it
    final GATKRead read = originalRead.copy();

    // only informative reads are given the haplotype tag to enhance visualization
    if ( isInformative ) {
        read.setAttribute(HAPLOTYPE_TAG, haplotype.hashCode());
    }

    // compute here the read starts w.r.t. the reference from the SW result and the hap -> ref cigar
    final Cigar rightPaddedHaplotypeVsRefCigar = haplotype.getConsolidatedPaddedCigar(1000);

    // this computes the number of reference bases before the read starts, based on the haplotype vs ref cigar
    // This cigar corresponds exactly to the readToRefCigarRaw, below.  One might wonder whether readToRefCigarRaw and
    // readToRefCigarClean ever imply different starts, which could occur if if the former has a leading deletion.  However,
    // according to the logic of applyCigarToCigar, this can only happen if the read has a leading deletion wrt its best haplotype,
    // which our SW aligner won't do, or if the read starts on a haplotype base that is in a deletion wrt to reference, which is nonsensical
    // since a base that exists is not a deletion.  Thus, there is nothing to worry about, in contrast to below where we do check
    // whether left-alignment shifted the start position.
    final int readStartOnReferenceHaplotype = readStartOnReferenceHaplotype(rightPaddedHaplotypeVsRefCigar, readToHaplotypeSWAlignment.getAlignmentOffset());


    //final int readStartOnReference = referenceStart + haplotype.getAlignmentStartHapwrtRef() + readStartOnHaplotype;

    final int readStartOnReference = referenceStart + haplotype.getAlignmentStartHapwrtRef() + readStartOnReferenceHaplotype;

    // compute the read -> ref alignment by mapping read -> hap -> ref from the
    // SW of read -> hap mapped through the given by hap -> ref

    // this is the sub-cigar of the haplotype-to-ref alignment, with cigar elements before the read start removed.  Elements after the read end are kept.
    final Cigar haplotypeToRef = trimCigarByBases(rightPaddedHaplotypeVsRefCigar, readToHaplotypeSWAlignment.getAlignmentOffset(), rightPaddedHaplotypeVsRefCigar.getReadLength() - 1).getCigar();

    final Cigar readToRefCigar = applyCigarToCigar(swCigar, haplotypeToRef);
    final CigarBuilder.Result leftAlignedReadToRefCigarResult = leftAlignIndels(readToRefCigar, refHaplotype.getBases(), originalRead.getBases(), readStartOnReferenceHaplotype);
    final Cigar leftAlignedReadToRefCigar = leftAlignedReadToRefCigarResult.getCigar();
    // it's possible that left-alignment shifted a deletion to the beginning of a read and removed it, shifting the first aligned base to the right
    read.setPosition(read.getContig(), readStartOnReference + leftAlignedReadToRefCigarResult.getLeadingDeletionBasesRemoved());

    // the SW Cigar does not contain the hard clips of the original read
    final Cigar originalCigar = originalRead.getCigar();
    final CigarElement firstElement = originalCigar.getFirstCigarElement();
    final CigarElement lastElement = originalCigar.getLastCigarElement();
    final List<CigarElement> readToRefCigarElementsWithHardClips = new ArrayList<>();
    if (firstElement.getOperator() == CigarOperator.HARD_CLIP) {
        readToRefCigarElementsWithHardClips.add(firstElement);
    }
    readToRefCigarElementsWithHardClips.addAll(leftAlignedReadToRefCigar.getCigarElements());
    if (lastElement.getOperator() == CigarOperator.HARD_CLIP) {
        readToRefCigarElementsWithHardClips.add(lastElement);
    }

    read.setCigar(new Cigar(readToRefCigarElementsWithHardClips));

    if ( leftAlignedReadToRefCigar.getReadLength() != read.getLength() ) {
        throw new GATKException("Cigar " + leftAlignedReadToRefCigar + " with read length " + leftAlignedReadToRefCigar.getReadLength()
                + " != read length " + read.getLength() + " for read " + read.toString() + "\nhapToRef " + haplotypeToRef + " length " + haplotypeToRef.getReadLength() + "/" + haplotypeToRef.getReferenceLength()
                + "\nreadToHap " + swCigar + " length " + swCigar.getReadLength() + "/" + swCigar.getReferenceLength());
    }

    return read;
}
 
Example 5
Source File: AlignmentUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
/**
 * Generate an array of bases for just those that are aligned to the reference (i.e. no clips or insertions)
 *
 * @param cigar            the read's CIGAR -- cannot be null
 * @param read             the read's base array
 * @return a non-null array of bases (bytes)
 */
@SuppressWarnings("fallthrough")
public static byte[] readToAlignmentByteArray(final Cigar cigar, final byte[] read) {
    Utils.nonNull(cigar);
    Utils.nonNull(read);

    final int alignmentLength = cigar.getReferenceLength();
    final byte[] alignment = new byte[alignmentLength];
    int alignPos = 0;
    int readPos = 0;
    for (int iii = 0; iii < cigar.numCigarElements(); iii++) {

        final CigarElement ce = cigar.getCigarElement(iii);
        final int elementLength = ce.getLength();

        switch (ce.getOperator()) {
            case I:
                if (alignPos > 0) {
                    final int prevPos = alignPos - 1;
                    if (alignment[prevPos] == BaseUtils.Base.A.base) {
                        alignment[prevPos] = PileupElement.A_FOLLOWED_BY_INSERTION_BASE;
                    } else if (alignment[prevPos] == BaseUtils.Base.C.base) {
                        alignment[prevPos] = PileupElement.C_FOLLOWED_BY_INSERTION_BASE;
                    } else if (alignment[prevPos] == BaseUtils.Base.T.base) {
                        alignment[prevPos] = PileupElement.T_FOLLOWED_BY_INSERTION_BASE;
                    } else if (alignment[prevPos] == BaseUtils.Base.G.base) {
                        alignment[prevPos] = PileupElement.G_FOLLOWED_BY_INSERTION_BASE;
                    }
                }
            case S:
                readPos += elementLength;
                break;
            case D:
            case N:
                for (int jjj = 0; jjj < elementLength; jjj++) {
                    alignment[alignPos++] = PileupElement.DELETION_BASE;
                }
                break;
            case M:
            case EQ:
            case X:
                for (int jjj = 0; jjj < elementLength; jjj++) {
                    alignment[alignPos++] = read[readPos++];
                }
                break;
            case H:
            case P:
                break;
            default:
                throw new GATKException("Unsupported cigar operator: " + ce.getOperator());
        }
    }
    return alignment;
}
 
Example 6
Source File: AlignmentUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
/**
 * Takes the alignment of the read sequence <code>readSeq</code> to the reference sequence <code>refSeq</code>
 * starting at 0-based position <code>readStart</code> on the <code>ref</code> and specified by its <code>cigar</code>.
 * <p/>
 * If the alignment has one or more indels, this method attempts to move them left across a stretch of repetitive bases.
 * For instance, if the original cigar specifies that (any) one AT is deleted from a repeat sequence TATATATA, the output
 * cigar will always mark the leftmost AT as deleted. If there is no indel in the original cigar or if the indel position
 * is determined unambiguously (i.e. inserted/deleted sequence is not repeated), the original cigar is returned.
 *
 *
 * @param cigar     structure of the original alignment
 * @param ref    reference sequence the read is aligned to
 * @param read   read sequence
 * @param readStart  0-based alignment start position on ref
 * @return a non-null cigar, in which the indels are guaranteed to be placed at the leftmost possible position across a repeat (if any)
 */
public static CigarBuilder.Result leftAlignIndels(final Cigar cigar, final byte[] ref, final byte[] read, final int readStart) {
    ParamUtils.isPositiveOrZero(readStart, "read start within reference base array must be non-negative");

    if (cigar.getCigarElements().stream().noneMatch(elem -> elem.getOperator().isIndel())) {
        return new CigarBuilder.Result(cigar, 0, 0);
    }

    // we need reference bases from the start of the read to the rightmost indel
    final int lastIndel = IntStream.range(0, cigar.numCigarElements()).filter(n -> cigar.getCigarElement(n).getOperator().isIndel()).max().getAsInt();
    final int necessaryRefLength = readStart + cigar.getCigarElements().stream().limit(lastIndel + 1).mapToInt(e -> lengthOnReference(e)).sum();
    Utils.validateArg(necessaryRefLength <= ref.length, "read goes past end of reference");

    // at this point, we are one base past the end of the read.  Now we traverse the cigar from right to left
    final List<CigarElement> resultRightToLeft = new ArrayList<>();
    final int refLength = cigar.getReferenceLength();
    final IndexRange refIndelRange = new IndexRange(readStart + refLength, readStart + refLength);
    final IndexRange readIndelRange = new IndexRange(read.length,read.length);
    for (int n = cigar.numCigarElements() - 1; n >= 0; n--) {
        final CigarElement element = cigar.getCigarElement(n);
        // if it's an indel, just accumulate the read and ref bases consumed.  We won't shift the indel until we hit an alignment
        // block or the read start.
        if (element.getOperator().isIndel()) {
            refIndelRange.shiftStartLeft(lengthOnReference(element));
            readIndelRange.shiftStartLeft(lengthOnRead(element));
        } else if (refIndelRange.size() == 0 && readIndelRange.size() == 0) {   // no indel, just add the cigar element to the result
            resultRightToLeft.add(element);
            refIndelRange.shiftLeft(lengthOnReference(element));
            readIndelRange.shiftLeft(lengthOnRead(element));
        } else {    // there's an indel that we need to left-align
            // we can left-align into match cigar elements but not into clips
            final int maxShift = element.getOperator().isAlignment() ? element.getLength() : 0;
            final Pair<Integer, Integer> shifts = normalizeAlleles(Arrays.asList(ref, read), Arrays.asList(refIndelRange, readIndelRange), maxShift, true);

            // account for new match alignments on the right due to left-alignment
            resultRightToLeft.add(new CigarElement(shifts.getRight(), CigarOperator.MATCH_OR_MISMATCH));

            // emit if we didn't go all the way to the start of an alignment block OR we have reached clips OR we have reached the start of the cigar
            final boolean emitIndel = n == 0 || shifts.getLeft() < maxShift || !element.getOperator().isAlignment();
            final int newMatchOnLeftDueToTrimming = shifts.getLeft() < 0 ? -shifts.getLeft() : 0;   // we may have actually shifted right to make the alleles parsimonious
            final int remainingBasesOnLeft = shifts.getLeft() < 0 ? element.getLength() : (element.getLength() - shifts.getLeft());

            if (emitIndel) {  // some of this alignment block remains after left-alignment -- emit the indel
                resultRightToLeft.add(new CigarElement(refIndelRange.size(), CigarOperator.DELETION));
                resultRightToLeft.add(new CigarElement(readIndelRange.size(), CigarOperator.INSERTION));
                refIndelRange.shiftEndLeft(refIndelRange.size());       // ref is empty and points to start of left-aligned indel
                readIndelRange.shiftEndLeft(readIndelRange.size());     // read is empty and points to start of left-aligned indel
                refIndelRange.shiftLeft(remainingBasesOnLeft + newMatchOnLeftDueToTrimming);          // ref is empty and points to end of element preceding this match block
                readIndelRange.shiftLeft(remainingBasesOnLeft + newMatchOnLeftDueToTrimming);         // read is empty and points to end of element preceding this match block
            }
            resultRightToLeft.add(new CigarElement(newMatchOnLeftDueToTrimming, CigarOperator.MATCH_OR_MISMATCH));
            resultRightToLeft.add(new CigarElement(remainingBasesOnLeft, element.getOperator()));
        }
    }

    // account for any indels at the start of the cigar that weren't processed because they have no adjacent non-indel element to the left
    resultRightToLeft.add(new CigarElement(refIndelRange.size(), CigarOperator.DELETION));
    resultRightToLeft.add(new CigarElement(readIndelRange.size(), CigarOperator.INSERTION));

    Utils.validateArg(readIndelRange.getStart() == 0, "Given cigar does not account for all bases of the read");
    return new CigarBuilder().addAll(Lists.reverse(resultRightToLeft)).makeAndRecordDeletionsRemovedResult();
}