Java Code Examples for htsjdk.samtools.CigarOperator#INSERTION

The following examples show how to use htsjdk.samtools.CigarOperator#INSERTION . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: CigarUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Convert the 'I' CigarElement, if it is at either end (terminal) of the input cigar, to a corresponding 'S' operator.
 * Note that we allow CIGAR of the format '10H10S10I10M', but disallows the format if after the conversion the cigar turns into a giant clip,
 * e.g. '10H10S10I10S10H' is not allowed (if allowed, it becomes a giant clip of '10H30S10H' which is non-sense).
 *
 * @return a pair of number of clipped (hard and soft, including the ones from the converted terminal 'I') bases at the front and back of the
 *         input {@code cigarAlongInput5to3Direction}.
 *
 * @throws IllegalArgumentException when the checks as described above fail.
 */
public static Cigar convertTerminalInsertionToSoftClip(final Cigar cigar) {

    if (cigar.numCigarElements() < 2 ) {
        return cigar;
    }

    final CigarBuilder builder = new CigarBuilder();
    for (int n = 0; n < cigar.numCigarElements(); n++) {
        final CigarElement element = cigar.getCigarElement(n);
        if (element.getOperator() != CigarOperator.INSERTION) { // not an insertion
            builder.add(element);
        } else if (n == 0 || n == cigar.numCigarElements() - 1) {   // terminal insertion with no clipping -- convert to soft clip
            builder.add(new CigarElement(element.getLength(), CigarOperator.SOFT_CLIP));
        } else if (cigar.getCigarElement(n-1).getOperator().isClipping() || cigar.getCigarElement(n+1).getOperator().isClipping()) {    // insertion preceding or following clip
            builder.add(new CigarElement(element.getLength(), CigarOperator.SOFT_CLIP));
        } else {    // interior insertion
            builder.add(element);
        }
    }

    return builder.make();
}
 
Example 2
Source File: ReadClassifier.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
private void checkBigIndel( final List<CigarElement> cigarElements,
                           final GATKRead read,
                           final List<BreakpointEvidence> evidenceList ) {
    int locus = read.getStart();
    for ( final CigarElement ele : cigarElements ) {
        final CigarOperator op = ele.getOperator();
        if ( ele.getLength() >= MIN_INDEL_LEN ) {
            if ( op == CigarOperator.INSERTION ) {
                evidenceList.add(new BreakpointEvidence.LargeIndel(read, readMetadata, locus));
            } else if ( op == CigarOperator.DELETION ) {
                evidenceList.add(new BreakpointEvidence.LargeIndel(read, readMetadata, locus+ele.getLength()/2));
            }
        }
        if ( op.consumesReferenceBases() ) {
            locus += ele.getLength();
        }
    }
}
 
Example 3
Source File: CigarUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Checks if cigar has consecutive I/D elements.
 */
private static boolean hasConsecutiveIndels(final List<CigarElement> elems) {
    boolean prevIndel = false;
    for (final CigarElement elem : elems) {
        final CigarOperator op = elem.getOperator();
        final boolean isIndel = (op == CigarOperator.INSERTION || op == CigarOperator.DELETION);
        if (prevIndel && isIndel) {
            return true;
        }
        prevIndel = isIndel;
    }
    return false;
}
 
Example 4
Source File: ReadPosRankSumTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
public static OptionalDouble getReadPosition(final GATKRead read, final VariantContext vc) {
    Utils.nonNull(read);

    // edge case: if a read starts with an insertion then the variant context's end is the reference base BEFORE the read start,
    // which appears like no overlap
    if (read.getStart() == vc.getEnd() + 1 && read.getCigarElements().stream().map(CigarElement::getOperator)
            .filter(o -> !o.isClipping()).findFirst().orElse(null) == CigarOperator.INSERTION) {
        return OptionalDouble.of(0);
    }

    final Pair<Integer, CigarOperator> offset = ReadUtils.getReadIndexForReferenceCoordinate(read, vc.getStart());

    if (offset.getLeft() == ReadUtils.READ_INDEX_NOT_FOUND) {
        return OptionalDouble.empty();
    }

    // hard clips at this point in the code are perfectly good bases that were clipped to make the read fit the assembly region
    final Cigar cigar = read.getCigar();
    final CigarElement firstElement = cigar.getFirstCigarElement();
    final CigarElement lastElement = cigar.getLastCigarElement();
    final int leadingHardClips = firstElement.getOperator() == CigarOperator.HARD_CLIP ? firstElement.getLength() : 0;
    final int trailingHardClips = lastElement.getOperator() == CigarOperator.HARD_CLIP ? lastElement.getLength() : 0;


    final int leftDistance = leadingHardClips + offset.getLeft();
    final int rightDistance = read.getLength() - 1 - offset.getLeft() + trailingHardClips;
    return OptionalDouble.of(Math.min(leftDistance, rightDistance));
}
 
Example 5
Source File: IndelShifter.java    From abra2 with MIT License 4 votes vote down vote up
public Cigar shiftAllIndelsLeft(int refStart, int refEnd, String chromosome, Cigar cigar, String seq, CompareToReference2 c2r) {
	
	try {
	
		List<CigarElement> elems = new ArrayList<CigarElement>(cigar.getCigarElements());
		
		int elemSize = elems.size();
		
		for (int i=1; i<elemSize-1; i++) {
			
			int refOffset = 0;
			int readOffset = 0;
			
			for (int j=0; j<i-1; j++) {
				refOffset += getRefOffset(elems.get(j));
				readOffset += getReadOffset(elems.get(j));
			}
			
			CigarElement prev = elems.get(i-1);
			CigarElement elem = elems.get(i);
			CigarElement next = elems.get(i+1);
			
			if ((elem.getOperator() == CigarOperator.DELETION || elem.getOperator() == CigarOperator.INSERTION) &&
				prev.getOperator() == CigarOperator.MATCH_OR_MISMATCH && next.getOperator() == CigarOperator.MATCH_OR_MISMATCH) {
				
				// subset cigar here and attempt to shift
				Cigar subCigar = new Cigar(Arrays.asList(prev, elem, next));
				String subSeq = seq.substring(readOffset, readOffset+subCigar.getReadLength());
				Cigar newCigar = shiftIndelsLeft(refStart + refOffset, refStart + refOffset + subCigar.getReferenceLength(),
						chromosome, subCigar, subSeq, c2r);
				
				//TODO: Merge abutting indels if applicable
				elems.set(i-1, newCigar.getCigarElement(0));
				elems.set(i, newCigar.getCigarElement(1));
				if (newCigar.getCigarElements().size() == 3) {
					elems.set(i+1, newCigar.getCigarElement(2));
				} else {
					elems.remove(i+1);
					elemSize -= 1;
				}
			}
		}
		
		mergeAbuttingElements(elems);
		return new Cigar(elems);
	} catch (RuntimeException e) {
		e.printStackTrace();
		Logger.error("Error on: " + refStart + ", " + refEnd + "," + chromosome + ", " + cigar + ", " + seq);
		throw e;
	}
}
 
Example 6
Source File: PalindromeArtifactClipReadTransformer.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
@Override
public GATKRead apply(final GATKRead read) {
    final int adaptorBoundary = read.getAdaptorBoundary();
    if (!read.isProperlyPaired() || adaptorBoundary == ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY) {
        return read;
    }

    final Cigar cigar = read.getCigar();
    final CigarOperator firstOperator = cigar.getFirstCigarElement().getOperator();
    final CigarOperator lastOperator = cigar.getLastCigarElement().getOperator();
    final boolean readIsUpstreamOfMate = read.getFragmentLength() > 0;

    if ((readIsUpstreamOfMate && firstOperator != CigarOperator.SOFT_CLIP && firstOperator != CigarOperator.INSERTION) ||
            (!readIsUpstreamOfMate && lastOperator != CigarOperator.SOFT_CLIP && lastOperator != CigarOperator.INSERTION)) {
        return read;
    }

    final int potentialArtifactBaseCount = readIsUpstreamOfMate ? cigar.getFirstCigarElement().getLength() :
            cigar.getLastCigarElement().getLength();


    final int numBasesToCompare = Math.min(potentialArtifactBaseCount + minPalindromeSize, read.getLength());

    // the reference position of bases that are the reverse complement of the suspected artifact
    final String contig = read.getContig();
    final int refStart = readIsUpstreamOfMate ? adaptorBoundary - numBasesToCompare : adaptorBoundary + 1;
    final int refEnd = readIsUpstreamOfMate ? adaptorBoundary - 1 : adaptorBoundary + numBasesToCompare;
    
    // it's possible that the read's assigned fragment length / mate start were incompatible with the contig length,
    // so we explicitly check for this case to avoid errors below.  We can't rely on the alignment!
    if (refStart < 1 || refEnd > sequenceDictionary.getSequence(contig).getSequenceLength()) {
        return read;
    }

    // if the reference bases overlap the soft-clipped bases, it's not an artifact.  Note that read.getStart() is the unclipped start
    // this can happen, for a read with a huge soft-clip that overlaps its mate significantly.
    if ( (readIsUpstreamOfMate && refStart < read.getStart()) || (!readIsUpstreamOfMate && read.getEnd() < refEnd)) {
        return read;
    }
    final SimpleInterval refInterval = new SimpleInterval(contig, refStart, refEnd);
    final MutableInt numMatch = new MutableInt(0);

    // we traverse the reference in the forward direction, hence the read in the reverse direction
    final MutableInt readIndex = new MutableInt(readIsUpstreamOfMate ? numBasesToCompare - 1 : read.getLength() - 1);
    referenceDataSource.query(refInterval).forEachRemaining(refBase -> {
        if (BaseUtils.getComplement(refBase) == read.getBase(readIndex.getValue())) {
            numMatch.increment();
        }
        readIndex.decrement();
    });

    if (numMatch.doubleValue() / numBasesToCompare >= MIN_FRACTION_OF_MATCHING_BASES) {
        final ReadClipper readClipper = new ReadClipper(read);
        final ClippingOp clippingOp = readIsUpstreamOfMate ? new ClippingOp(0, potentialArtifactBaseCount - 1) :
                new ClippingOp(read.getLength() - potentialArtifactBaseCount, read.getLength());
        readClipper.addOp(clippingOp);
        return readClipper.clipRead(ClippingRepresentation.HARDCLIP_BASES);
    } else {
        return read;
    }
}
 
Example 7
Source File: CigarBuilder.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
private boolean lastTwoElementsWereDeletionAndInsertion() {
    return lastOperator == CigarOperator.INSERTION && cigarElements.size() > 1 && cigarElements.get(cigarElements.size() - 2).getOperator() == CigarOperator.DELETION;
}
 
Example 8
Source File: AlignmentUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
private static Pair<byte[], byte[]> getBasesAndBaseQualitiesAlignedOneToOne(final GATKRead read, final byte basePadCharacter, final byte qualityPadCharacter) {
    Utils.nonNull(read);
    // As this code is performance sensitive in the HaplotypeCaller, we elect to use the noCopy versions of these getters.
    // We can do this because we don't mutate base or quality arrays in this method or in its accessors
    final byte[] bases = read.getBasesNoCopy();
    final byte[] baseQualities = read.getBaseQualitiesNoCopy();
    final int numCigarElements = read.numCigarElements();
    boolean sawIndel = false;

    // Check if the cigar contains indels
    // Note that we don't call ContainsOperator() here twice to avoid the performance hit of building stream iterators twice
    for (int i = 0; i < numCigarElements; i++) {
        final CigarOperator e = read.getCigarElement(i).getOperator();
        if (e == CigarOperator.INSERTION || e == CigarOperator.DELETION) {
            sawIndel = true;
            break;
        }
    }
    if (!sawIndel) {
        return new ImmutablePair<>(bases, baseQualities);
    }
    else {
        final int numberRefBasesIncludingSoftclips = CigarUtils.countRefBasesAndSoftClips(read.getCigarElements(), 0, numCigarElements);
        final byte[] paddedBases = new byte[numberRefBasesIncludingSoftclips];
        final byte[] paddedBaseQualities = new byte[numberRefBasesIncludingSoftclips];
        int literalPos = 0;
        int paddedPos = 0;
        for ( int i = 0; i < numCigarElements; i++ ) {
            final CigarElement ce = read.getCigarElement(i);
            final CigarOperator co = ce.getOperator();
            if (co.consumesReadBases()) {
                if (!co.consumesReferenceBases()) {
                    literalPos += ce.getLength();  //skip inserted bases
                }
                else {
                    System.arraycopy(bases, literalPos, paddedBases, paddedPos, ce.getLength());
                    System.arraycopy(baseQualities, literalPos, paddedBaseQualities, paddedPos, ce.getLength());
                    literalPos += ce.getLength();
                    paddedPos += ce.getLength();
                }
            }
            else if (co.consumesReferenceBases()) {
                for ( int j = 0; j < ce.getLength(); j++ ) {  //pad deleted bases
                    paddedBases[paddedPos] = basePadCharacter;
                    paddedBaseQualities[paddedPos] = qualityPadCharacter;
                    paddedPos++;
                }
            }
        }
        return new ImmutablePair<>(paddedBases, paddedBaseQualities);
    }
}
 
Example 9
Source File: ReadClipperUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
private boolean cigarHasElementsDifferentThanInsertionsAndHardClips(final Cigar cigar) {
    for (final CigarElement cigarElement : cigar.getCigarElements())
        if (cigarElement.getOperator() != CigarOperator.INSERTION && cigarElement.getOperator() != CigarOperator.HARD_CLIP)
            return true;
    return false;
}
 
Example 10
Source File: Utils.java    From cramtools with Apache License 2.0 4 votes vote down vote up
/**
 * A rip off samtools bam_md.c
 * 
 * @param record
 * @param ref
 * @param calcMD
 * @param calcNM
 */
public static void calculateMdAndNmTags(SAMRecord record, byte[] ref, boolean calcMD, boolean calcNM) {
	if (!calcMD && !calcNM)
		return;

	Cigar cigar = record.getCigar();
	List<CigarElement> cigarElements = cigar.getCigarElements();
	byte[] seq = record.getReadBases();
	int start = record.getAlignmentStart() - 1;
	int i, x, y, u = 0;
	int nm = 0;
	StringBuffer str = new StringBuffer();

	int size = cigarElements.size();
	for (i = y = 0, x = start; i < size; ++i) {
		CigarElement ce = cigarElements.get(i);
		int j, l = ce.getLength();
		CigarOperator op = ce.getOperator();
		if (op == CigarOperator.MATCH_OR_MISMATCH || op == CigarOperator.EQ || op == CigarOperator.X) {
			for (j = 0; j < l; ++j) {
				int z = y + j;

				if (ref.length <= x + j)
					break; // out of boundary

				int c1 = 0;
				int c2 = 0;
				// try {
				c1 = seq[z];
				c2 = ref[x + j];

				if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) {
					// a match
					++u;
				} else {
					str.append(u);
					str.appendCodePoint(ref[x + j]);
					u = 0;
					++nm;
				}
			}
			if (j < l)
				break;
			x += l;
			y += l;
		} else if (op == CigarOperator.DELETION) {
			str.append(u);
			str.append('^');
			for (j = 0; j < l; ++j) {
				if (ref[x + j] == 0)
					break;
				str.appendCodePoint(ref[x + j]);
			}
			u = 0;
			if (j < l)
				break;
			x += l;
			nm += l;
		} else if (op == CigarOperator.INSERTION || op == CigarOperator.SOFT_CLIP) {
			y += l;
			if (op == CigarOperator.INSERTION)
				nm += l;
		} else if (op == CigarOperator.SKIPPED_REGION) {
			x += l;
		}
	}
	str.append(u);

	if (calcMD)
		record.setAttribute(SAMTag.MD.name(), str.toString());
	if (calcNM)
		record.setAttribute(SAMTag.NM.name(), nm);
}
 
Example 11
Source File: AlignmentsTags.java    From cramtools with Apache License 2.0 4 votes vote down vote up
/**
 * Same as {@link htsjdk.samtools.util.SequenceUtil#calculateMdAndNmTags}
 * but allows for reference excerpt instead of a full reference sequence.
 * 
 * @param record
 *            SAMRecord to inject NM and MD tags into
 * @param ref
 *            reference sequence bases, may be a subsequence starting at
 *            refOffest
 * @param refOffset
 *            array offset of the reference bases, 0 is the same as the
 *            whole sequence. This value will be subtracted from the
 *            reference array index in calculations.
 * @param calcMD
 *            calculate MD tag if true
 * @param calcNM
 *            calculate NM tag if true
 */
public static void calculateMdAndNmTags(final SAMRecord record, final byte[] ref, final int refOffset,
		final boolean calcMD, final boolean calcNM) {
	if (!calcMD && !calcNM)
		return;

	final Cigar cigar = record.getCigar();
	final List<CigarElement> cigarElements = cigar.getCigarElements();
	final byte[] seq = record.getReadBases();
	final int start = record.getAlignmentStart() - 1;
	int i, x, y, u = 0;
	int nm = 0;
	final StringBuilder str = new StringBuilder();

	final int size = cigarElements.size();
	for (i = y = 0, x = start; i < size; ++i) {
		final CigarElement ce = cigarElements.get(i);
		int j;
		final int length = ce.getLength();
		final CigarOperator op = ce.getOperator();
		if (op == CigarOperator.MATCH_OR_MISMATCH || op == CigarOperator.EQ || op == CigarOperator.X) {
			for (j = 0; j < length; ++j) {
				final int z = y + j;

				if (refOffset + ref.length <= x + j)
					break; // out of boundary

				int c1 = 0;
				int c2 = 0;
				// try {
				c1 = seq[z];
				c2 = ref[x + j - refOffset];

				if ((c1 == c2) || c1 == 0) {
					// a match
					++u;
				} else {
					str.append(u);
					str.appendCodePoint(ref[x + j - refOffset]);
					u = 0;
					++nm;
				}
			}
			if (j < length)
				break;
			x += length;
			y += length;
		} else if (op == CigarOperator.DELETION) {
			str.append(u);
			str.append('^');
			for (j = 0; j < length; ++j) {
				if (ref[x + j - refOffset] == 0)
					break;
				str.appendCodePoint(ref[x + j - refOffset]);
			}
			u = 0;
			if (j < length)
				break;
			x += length;
			nm += length;
		} else if (op == CigarOperator.INSERTION || op == CigarOperator.SOFT_CLIP) {
			y += length;
			if (op == CigarOperator.INSERTION)
				nm += length;
		} else if (op == CigarOperator.SKIPPED_REGION) {
			x += length;
		}
	}
	str.append(u);

	if (calcMD)
		record.setAttribute(SAMTag.MD.name(), str.toString());
	if (calcNM)
		record.setAttribute(SAMTag.NM.name(), nm);
}