Java Code Examples for htsjdk.samtools.util.SequenceUtil#reverseComplement()

The following examples show how to use htsjdk.samtools.util.SequenceUtil#reverseComplement() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: TestUtilsForAssemblyBasedSVDiscovery.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
public static AlignedContig fromPrimarySAMRecordString(final String samRecordStringWithExplicitTabEscapeSequenceAndNoXAField,
                                                       final boolean hasSATag) {
    final AlignmentInterval primaryAlignment = fromSAMRecordString(samRecordStringWithExplicitTabEscapeSequenceAndNoXAField, hasSATag);

    final String[] fields = samRecordStringWithExplicitTabEscapeSequenceAndNoXAField.split("\t");
    final String readName = fields[0];
    final int samFlag = Integer.valueOf( fields[1] ) ;
    final byte[] sequence = fields[9].getBytes();
    final boolean forwardStrand = SAMFlag.READ_REVERSE_STRAND.isUnset(samFlag);
    if (!forwardStrand) {
        SequenceUtil.reverseComplement(sequence);
    }

    if (hasSATag) {
        final String saTag = fields[11];
        final String[] supplements = saTag.substring(3 + saTag.indexOf(":Z:")).split(";");
        final List<AlignmentInterval> alignments = new ArrayList<>(supplements.length + 1);
        alignments.add(primaryAlignment);
        for (final String sup : supplements) {
            alignments.add( new AlignmentInterval(sup) );
        }
        return new AlignedContig(readName, sequence, alignments);
    } else {
        return new AlignedContig(readName, sequence, Collections.singletonList(primaryAlignment));
    }
}
 
Example 2
Source File: PSFilter.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Returns input read with alignment-related info cleared
 */
private static GATKRead clearReadAlignment(final GATKRead read, final SAMFileHeader header) {
    final GATKRead newRead = new SAMRecordToGATKReadAdapter(new SAMRecord(header));
    newRead.setName(read.getName());
    newRead.setBases(read.getBases());
    newRead.setBaseQualities(read.getBaseQualities());
    if (read.isReverseStrand()) {
        SequenceUtil.reverseComplement(newRead.getBases());
        SequenceUtil.reverseQualities(newRead.getBaseQualities());
    }
    newRead.setIsUnmapped();
    newRead.setIsPaired(read.isPaired());
    if (read.isPaired()) {
        newRead.setMateIsUnmapped();
        if (read.isFirstOfPair()) {
            newRead.setIsFirstOfPair();
        } else if (read.isSecondOfPair()) {
            newRead.setIsSecondOfPair();
        }
    }
    final String readGroup = read.getReadGroup();
    if (readGroup != null) {
        newRead.setAttribute(SAMTag.RG.name(), readGroup);
    }
    return newRead;
}
 
Example 3
Source File: BreakpointComplications.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
private static byte[] reverseComplementIfNecessary(final byte[] seq,
                                                   final AlignmentInterval first, final AlignmentInterval second,
                                                   final boolean firstAfterSecond) {
    if ( first.forwardStrand == second.forwardStrand ) { // reference order switch
        if (!first.forwardStrand) {
            SequenceUtil.reverseComplement(seq, 0, seq.length);
        }
    } else {
        if (firstAfterSecond == first.forwardStrand) {
            SequenceUtil.reverseComplement(seq, 0, seq.length);
        }
    }
    return seq;
}
 
Example 4
Source File: PSFilter.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Pairs reads with canonical Long ID using the lesser 64-bit hash of the sequence and its reverse complement
 */
@VisibleForTesting
static Tuple2<Long, GATKRead> canonicalizeRead(final GATKRead read) {
    final byte[] bases = read.getBases();
    final long hashForward = SVUtils.fnvByteArray64(bases);
    SequenceUtil.reverseComplement(bases);
    final long hashReverse = SVUtils.fnvByteArray64(bases);
    return new Tuple2<>(Math.min(hashForward, hashReverse), read);
}
 
Example 5
Source File: AnnotationUtilsTest.java    From dataflow-java with Apache License 2.0 5 votes vote down vote up
@Test
public void testDetermineVariantTranscriptEffect_reverseStrand() {
  Annotation transcript = new Annotation()
    .setReferenceName("1")
    .setStart(2L)
    .setEnd(20L)
    .setReverseStrand(true)
    .setTranscript(new Transcript()
      .setCodingSequence(new CodingSequence().setStart(3L).setEnd(18L))
      .setExons(ImmutableList.of(
          new Exon().setStart(2L).setEnd(7L).setFrame(2),
          new Exon().setStart(10L).setEnd(20L).setFrame(1))
      ));

  String bases = SequenceUtil.reverseComplement(
      // First exon [10, 20)
      "AC" + // 5' UTR
      "ATG" + "ACG" + "GT" +
      // intron
      "CCC" +
      // Second exon [2, 7)
      "G" + "TAG" +
      "G"); // 3' UTR
  assertEquals("ATG -> ACG (reverse complement), AA is M -> T",
      VariantEffect.NONSYNONYMOUS_SNP,
      AnnotationUtils.determineVariantTranscriptEffect(16L, "G", transcript, bases));
  assertEquals("TAG -> CAG (reverse complement), AA is STOP -> Q",
      VariantEffect.STOP_LOSS,
      AnnotationUtils.determineVariantTranscriptEffect(5L, "G", transcript, bases));
  assertNull("mutates intron",
      AnnotationUtils.determineVariantTranscriptEffect(9L, "C", transcript, bases));
  assertNull("mutates 5' UTR",
      AnnotationUtils.determineVariantTranscriptEffect(19L, "C", transcript, bases));
}
 
Example 6
Source File: ClippingUtility.java    From picard with MIT License 5 votes vote down vote up
/**
 *  Returns an array of bytes representing the bases in the read,
 *  reverse complementing them if the read is on the negative strand
 */
private static byte[] getReadBases(final SAMRecord read) {
    if (!read.getReadNegativeStrandFlag()) {
        return read.getReadBases();
    }
    else {
        final byte[] reverseComplementedBases = new byte[read.getReadBases().length];
        System.arraycopy(read.getReadBases(), 0, reverseComplementedBases, 0, reverseComplementedBases.length);
        SequenceUtil.reverseComplement(reverseComplementedBases);
        return reverseComplementedBases;
    }
}
 
Example 7
Source File: AdapterDescriptor.java    From Drop-seq with MIT License 5 votes vote down vote up
@Override
public String getSequence(SAMRecord rec) {
    final String attribute = (String) rec.getAttribute(binaryTag);
    if (reverseComplement) {
        return SequenceUtil.reverseComplement(attribute);
    } else {
        return attribute;
    }
}
 
Example 8
Source File: ArtifactPrior.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
public ArtifactPrior getReverseComplement(){
    final double[] revCompPi = new double[F1R2FilterConstants.NUM_STATES];
    final String revCompRefContext = SequenceUtil.reverseComplement(referenceContext);
    for (final ArtifactState s : ArtifactState.values()){
        revCompPi[s.ordinal()] = pi[s.getRevCompState().ordinal()];
    }
    return new ArtifactPrior(revCompRefContext, revCompPi, numExamples, numAltExamples);
}
 
Example 9
Source File: CustomAdapterPair.java    From picard with MIT License 5 votes vote down vote up
CustomAdapterPair(final String fivePrime, final String threePrime) {
    this.threePrime = threePrime;
    this.threePrimeBytes = StringUtil.stringToBytes(threePrime);

    this.fivePrime = fivePrime;
    this.fivePrimeReadOrder = SequenceUtil.reverseComplement(fivePrime);
    this.fivePrimeBytes = StringUtil.stringToBytes(fivePrime);
    this.fivePrimeReadOrderBytes = StringUtil.stringToBytes(fivePrimeReadOrder);
}
 
Example 10
Source File: TrimSequenceTemplate.java    From Drop-seq with MIT License 5 votes vote down vote up
public TrimSequenceTemplate(final String barcode) {
	this.sequence = barcode;
	this.reverseComplement = SequenceUtil.reverseComplement(this.sequence);
	bases = StringUtil.stringToBytes(this.sequence);
	rcBases = StringUtil.stringToBytes(this.reverseComplement);
	this.ignoredBases = StringUtil.stringToBytes("Nn");
}
 
Example 11
Source File: TrimSequenceTemplate.java    From Drop-seq with MIT License 5 votes vote down vote up
public TrimSequenceTemplate(final String sequence, final String ignoredBases) {
	this.sequence = sequence;
	this.reverseComplement = SequenceUtil.reverseComplement(this.sequence);
	bases = StringUtil.stringToBytes(this.sequence);
	rcBases = StringUtil
			.stringToBytes(this.reverseComplement);
	this.ignoredBases = StringUtil
			.stringToBytes(ignoredBases);
}
 
Example 12
Source File: ArtifactPrior.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
public String getRCContext() {
    return SequenceUtil.reverseComplement(referenceContext);
}
 
Example 13
Source File: RrbsMetricsCollector.java    From picard with MIT License 4 votes vote down vote up
public void acceptRecord(final SAMRecordAndReference args) {
	mappedRecordCount++;

	final SAMRecord samRecord = args.getSamRecord();
	final ReferenceSequence referenceSequence = args.getReferenceSequence();

	final byte[] readBases = samRecord.getReadBases();
	final byte[] readQualities = samRecord.getBaseQualities();
	final byte[] refBases = referenceSequence.getBases();

	if (samRecord.getReadLength() < minReadLength) {
		smallReadCount++;
		return;
	} else if (SequenceUtil.countMismatches(samRecord, refBases, true) > Math.round(samRecord.getReadLength() * maxMismatchRate)) {
		mismatchCount++;
		return;
	}

	// We only record non-CpG C sites if there was at least one CpG in the read, keep track of
	// the values for this record and then apply to the global totals if valid
	int recordCpgs = 0;

	for (final AlignmentBlock alignmentBlock : samRecord.getAlignmentBlocks()) {
		final int blockLength = alignmentBlock.getLength();
		final int refFragmentStart = alignmentBlock.getReferenceStart() - 1;
		final int readFragmentStart = alignmentBlock.getReadStart() - 1;

		final byte[] refFragment = getFragment(refBases, refFragmentStart, blockLength);
		final byte[] readFragment = getFragment(readBases, readFragmentStart, blockLength);
		final byte[] readQualityFragment = getFragment(readQualities, readFragmentStart, blockLength);

		if (samRecord.getReadNegativeStrandFlag()) {
			// In the case of a negative strand, reverse (and complement for base arrays) the reference,
			// reads & qualities so that it can be treated as a positive strand for the rest of the process
			SequenceUtil.reverseComplement(refFragment);
			SequenceUtil.reverseComplement(readFragment);
			SequenceUtil.reverseQualities(readQualityFragment);
		}

		for (int i=0; i < blockLength-1; i++) {
			final int curRefIndex = getCurRefIndex(refFragmentStart, blockLength, i, samRecord.getReadNegativeStrandFlag());

			// Look at a 2-base window to see if we're on a CpG site, and if so check for conversion
			// (CG -> TG). We do not consider ourselves to be on a CpG site if we're on the last base of a read
			if ((SequenceUtil.basesEqual(refFragment[i], SequenceUtil.C)) &&
					(SequenceUtil.basesEqual(refFragment[i+1], SequenceUtil.G))) {
				// We want to catch the case where there's a CpG in the reference, even if it is not valid
				// to prevent the C showing up as a non-CpG C down below. Otherwise this could have been all
				// in one if statement
				if (isValidCpg(refFragment, readFragment, readQualityFragment, i)) {
					recordCpgs++;
					final CpgLocation curLocation = new CpgLocation(samRecord.getReferenceName(), curRefIndex);
					cpgTotal.increment(curLocation);
					if (SequenceUtil.isBisulfiteConverted(readFragment[i], refFragment[i])) {
						cpgConverted.increment(curLocation);
					}
				}
				i++;
			} else if (isC(refFragment[i], readFragment[i]) && isAboveCytoQcThreshold(readQualities, i) &&
					SequenceUtil.bisulfiteBasesEqual(false, readFragment[i+1], refFragment[i+1])) {
				// C base in the reference that's not associated with a CpG
				nCytoTotal++;
				if (SequenceUtil.isBisulfiteConverted(readFragment[i], refFragment[i])) {
					nCytoConverted++;
				}
			}
		}
	}

	if (recordCpgs == 0) {
		noCpgCount++;
	}
}
 
Example 14
Source File: TestUtilsCpxVariantInference.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
private static PreprocessedAndAnalysisReadyContigWithExpectedResults buildForContig_22672_4(final Set<String> canonicalChromosomes,
                                                                                            final SAMSequenceDictionary refSeqDict) {
    final AssemblyContigWithFineTunedAlignments
            analysisReadyContig = makeContigAnalysisReadyForCpxBranch("asm022672:tig00004\t16\tchr9\t130954964\t60\t1631S192M60I99M24I54M156I1430M\t*\t0\t0\tTCATCCAGGTTGGAGTGCAGTGGTGCTATCTCAGCTCACTGCAGCCTCCACCCCTGGATTGAATCGATTCTCCTGCCTCAGCCTCCTGAGTAGCTGGGATTACAGGAGTGCACCACCACGCCCGGCTCATGTTTGTGTTTTTAGTAGAGACAGGGTTTCACCACATTGGCCAGGCTGGTCTCAAACTCCTGACCTCAAGTGATCTGGCTGCCTTGGCCTCCCAAAGTGCTGGGATTATAGGTGTGAGCCACCACACCCAGCCAGAGTGGGTAGTTTTTAAAACCACCACAATTGCCCGCCAGGGGCACAAAGAGGACTCATGGGGATGGGTCTGATAAGTGCTGAGCCCAGTGCTGGGCACCTGCAGGCACTCAGTAGGTGGTGGACACTTGTTTTATTATGATTATCCAGCACCTAGCACCCAGTCTACCCCAAATAGCACCATCAACATATCTTCCTGAATCCTGCCTCCCTCATTTCAGCCTTTGCTCTCAGGCAGCCAGGGGTTCCCCATTTTCAACAAGAAAAAGCCCAAACCCTTTTGCTTGGCAGTCAACCCCACCCCATCCAAACCAATCCCTCCTTTCTCTGCTGAATGTTTCCCCTGTGCCAGCCACTTCCTTCCCACCACCACACCCTTAGCCAGGCTATCACCCACCAAGAATCCTGCCCCATCTTGAGCCTCTGCCTTCATTTGAAGCCTATCATCCCCTGTGCCTCAGTATTTTCCTTCAGCTCTAGATTCTTGCAGTGCTCTAGCCATTTACACAGCCATCAAATTCCAGGCCTCTTCTGTCACTCACCGGTTCACCTCTGTCATCTTGTCTCCTGCCCAAGACTGCATCTCCTCTTGTCCCTTCTCTGAGAGGCCTGTGGTTGAGCACAGGGATGAATGAGAAAGAAACCCAGCCTGTCCTCAAGAAGTTGACAATGTGTCCCTCAAAGACTGGGCTCATTGCTGCTACCTCTGGGAGTCCCAATGTGGAGGAGAGCTCTCAGTGGGTGTGCAGAAAATCTTGTCAGAAGTTACTGTCCCTGGGCAGGGCTAATACCACCAGTATCACCATTATCATCATCACCACTATCGTCATCATCACCACCATCACCATCATTACCACCATCATCACCATCAATATCGACACCATCCTCATCATCATCATCACCACCATCATCAGCATCATCATCAGCAGCAGCACCACCATTACCATCATTATCCCTACCATCTTCATCACCACCATCACCATCACCATCACCATCATCATCACCACCATCACCATCATCACCACCACCATCATCATCACCACCATCACCACCATCATCACCATCATCATCACCATCATCACCACCATCATCACCATCATCATCATCAGCACCACCACCACTATTACCATCATTATCACTACCATCCTCATCACCACCATCACTATCACCATCACCATCATCATCACCATCATCACCATCATCATCACCATCACCATCATCACCATCACCATCACCATCATCACCATCATCATCATCACCATCACCATCATCATCACCATCATCACCACCATCATCACCATCATCATCATCAGCACCACCACCACTATTACCATCATTATCCCTACCATCTTCATCACCACCATCACCATCACCATCACCATCATCATCACCACCATCACCATCACCATCACCATCATCATCACCATCATCACCACCATCATCACCATCATCATCATCAGCACCACCACCACTATTACCATCATTATCCCTACCATCCTCATCACCACCATCACCATCACCATCACCATCATCATCACCATCACCATCATCACCATCACCATCATCAACATCACCATCACCATCATCACCATCATCATCATCACCATCACCATCATCATCACCATCATCACCACCATCATCACCATCATCAGCAGCAGCAGCACCACCACCACTATTACCATCATTATCCCTACCATCCTCATCACCACCATCACCATCACCATCATCACCATCACCATCACCATCATCACCATCACCATCATCATCATCACCACCATCATCACCATCATCATCAGCAGCAGCACCACCACCACTATTACCATCATTATTCCTACCATCCTCATCACCACCCTCACCATCACCATCATCATCATGACCATCATCATCAGCACCACCGTTATCATCATTATCCCCACCGTCCTCATCACCATCATCACCATCACCATCATCATTGTCACCATCACCATGATCATCGTCACCATCTTTATCCCCACCATCCTCATCACCATCATCACTACCATCAGCACCATCACCATCATCACCATTACCACCATTATCACCACCATCATCATCACCAACACCATCCTCAGCACTACCACCACCATCACATTCTCATTCATCCCTGTGCTCAACCACAGGCCTCTCAGAGAAGGGACAAGAGGAGATGCAGTCTTGGGCAGGAGACCACTTTTGGGGGCAAAAGCAGCTCCTGAAACCACACCCCAAAGCAGGCTCCACTCCATTCCATCAGACCCTGCAGTCAGGAAGGGCCGTGGGGTGTCCTGGCCTTCATCTGTGAGAACTGCCTTACCCATGCTGATTTCCACCCACATGGCATCAGAGGACTCCGTGCCCTGACACTACAATCCTTGTCCCCTTGTGTAGCCACCTCAGGGTCCAGCACCAACTGTCCTGTCTACCTGGAGCATAACACCATGAGGTCCCCAGCTCCTGGCTCTCCCCTGGGGGCTTGCCATGACCCGTTGGCCTGAGCTTTGGGGTGTCAGAAGCCCCCATGGAATGCACTGCTAAGACAAGCCCATCCTGCCAGCCTTCCTACCTCTCATAGCTGACCCTGCTGGGTTCTATGTTACAAATTCCCCTGCCCCAGCACCACTCTGTGACCTGCAGTGAGCGCAGCTGTCACTCCCCTCAAGGGTGCCCAGGCAATAGGGAGATGTGAGGACTGTGGGGGCATAAGAGTTGTCTCAGGGCTGTGCAGAGGAAGCCAGGGCCCCCCTAAATATCTCCAGCTCATCGCACCAGCTCCATTCTGCCCGGAGCCCCATGTCTGAGCCCTCAGCTCAGAGCAGACAACCGTGCTAATCCTGTCCCAGGCTGGTGGCCGCCCCCACTGAGGAGGGGGAGCAAGCTGCCCAGAAAGCTGGTGTGGAGTGCCCGTTAGCTGTGGATGGCATCGTGGTGCCACCGGGAGATTATCCACACGCAGCTAGTTCCCAGCAGCCGACCCCAGTGCCCCAAAAGAAGGCAGCTAGCTATTAAGGAGATTCATGGGCAGGGGTGAGGCGAGGAGGAAGGCTAATCAAGCTGTCCTGATTGTAATTGTCCGAAGGTGCTGGCCTCTCTCGTAAACATGCCAACTGCAGGCCCTGCTTCTGCGTCTCAGCAGGACTTCTCCCTGCTGGGACCGTGCTGGGGAATGTTCTTTCAACATAACTCTATTTAAATTCACATTTCCATCATCCCCCAGAGGAGCCGAGAGAGCTGAGCTGCGTGGTATAAGCCGGGAAAGGATTAGATGGGGTGGGTGTTATTTTTTTTCCTTTTATTTTCCCTTAGTGATGGAGATGGGGTTGTGGGGGGTGGGCCATTCTAGAATTCTGCAGTATTGGAACTGGAAGAGCTGTTACAAACCATCCAGCTC\t*\tSA:Z:chr9,130953867,-,1274M42I167M2163H,60,55,1318;chr9,130955093,-,1469H33M3I179M1962H,19,14,138;\tMD:Z:15T1C5T11A0T3G0A2C4C3T8T5C5T2G8G5G2G6C24T45T2C23T5C14T46T33T5C32T1433\tRG:Z:GATKSVContigAlignments\tNM:i:268\tAS:i:1347\tXS:i:176",
            canonicalChromosomes, refSeqDict);

    final CpxVariantInducingAssemblyContig.BasicInfo manuallyCalculatedBasicInfo =
            new CpxVariantInducingAssemblyContig.BasicInfo("chr9", false,
                    new SimpleInterval("chr9", 130953867, 130953867), new SimpleInterval("chr9", 130956738, 130956738));

    //////////
    final List<CpxVariantInducingAssemblyContig.Jump> manuallyCalculatedJumps =
            Arrays.asList(
                    new CpxVariantInducingAssemblyContig.Jump(new SimpleInterval("chr9", 130955309, 130955309), new SimpleInterval("chr9", 130955308, 130955308), StrandSwitch.NO_SWITCH, 156),
                    new CpxVariantInducingAssemblyContig.Jump(new SimpleInterval("chr9", 130955156, 130955156), new SimpleInterval("chr9", 130955155, 130955155), StrandSwitch.NO_SWITCH, 60),
                    new CpxVariantInducingAssemblyContig.Jump(new SimpleInterval("chr9", 130954964, 130954964), new SimpleInterval("chr9", 130955307, 130955307), StrandSwitch.NO_SWITCH, 148)
            );
    final List<SimpleInterval> manuallyCalculatedEventPrimaryChromosomeSegmentingLocations =
            Arrays.asList(
                    new SimpleInterval("chr9", 130954964, 130954964),
                    new SimpleInterval("chr9", 130955155, 130955155),
                    new SimpleInterval("chr9", 130955156, 130955156),
                    new SimpleInterval("chr9", 130955307, 130955307),
                    new SimpleInterval("chr9", 130955308, 130955308),
                    new SimpleInterval("chr9", 130955309, 130955309)
            );
    final CpxVariantInducingAssemblyContig manuallyCalculatedCpxVariantInducingAssemblyContig =
            new CpxVariantInducingAssemblyContig(analysisReadyContig,
                    manuallyCalculatedBasicInfo,
                    manuallyCalculatedJumps,
                    manuallyCalculatedEventPrimaryChromosomeSegmentingLocations);

    //////////
    final SimpleInterval manuallyCalculatedAffectedRefRegion =
            new SimpleInterval("chr9", 130954964, 130955309);
    final List<SimpleInterval> manuallyCalculatedSegments =
            Arrays.asList(
                    new SimpleInterval("chr9", 130954964, 130955155),
                    new SimpleInterval("chr9", 130955156, 130955307),
                    new SimpleInterval("chr9", 130955307, 130955308));
    final List<String> manuallyCalculatedAltArrangementDescription =
            Arrays.asList("1", "2", "UINS-148", "1", "UINS-60", "2", "3", "UINS-156");
    final byte[] manuallyCalculatedAltSeq = Arrays.copyOfRange(analysisReadyContig.getContigSequence(), 1429, 2549);
    SequenceUtil.reverseComplement(manuallyCalculatedAltSeq);
    final CpxVariantCanonicalRepresentation manuallyCalculatedCpxVariantCanonicalRepresentation =
            new CpxVariantCanonicalRepresentation(
                    manuallyCalculatedAffectedRefRegion,
                    manuallyCalculatedSegments,
                    manuallyCalculatedAltArrangementDescription,
                    manuallyCalculatedAltSeq);

    //////////
    final byte[] dummyRefSequence = "T".getBytes();
    final VariantContextBuilder variantContextBuilder = new VariantContextBuilder()
            .chr("chr9").start(130954964).stop(130955309)
            .alleles(Arrays.asList(Allele.create(dummyRefSequence, true), Allele.create(SimpleSVType.createBracketedSymbAlleleString(CPX_SV_SYB_ALT_ALLELE_STR), false)))
            .id("CPX_chr9:130954964-130955309")
            .attribute(VCFConstants.END_KEY, 130955309)
            .attribute(SVTYPE, GATKSVVCFConstants.CPX_SV_SYB_ALT_ALLELE_STR)
            .attribute(SVLEN, 774)
            .attribute(SEQ_ALT_HAPLOTYPE, new String(manuallyCalculatedAltSeq));
    variantContextBuilder.attribute(CPX_EVENT_ALT_ARRANGEMENTS,
            String.join(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR, manuallyCalculatedAltArrangementDescription));
    variantContextBuilder.attribute(CPX_SV_REF_SEGMENTS,
            String.join(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR, manuallyCalculatedSegments.stream().map(SimpleInterval::toString).collect(Collectors.toList())));
    final VariantContext manuallyCalculatedVariantContext = variantContextBuilder.make();

    //////////
    final Set<SimpleInterval> manuallyCalculatedTwoBaseBoundaries = new HashSet<>();
    manuallyCalculatedTwoBaseBoundaries.add(new SimpleInterval("chr9", 130955309, 130955310));
    manuallyCalculatedTwoBaseBoundaries.add(new SimpleInterval("chr9", 130956737, 130956738));
    manuallyCalculatedTwoBaseBoundaries.add(new SimpleInterval("chr9", 130955156, 130955157));
    manuallyCalculatedTwoBaseBoundaries.add(new SimpleInterval("chr9", 130955307, 130955308));
    manuallyCalculatedTwoBaseBoundaries.add(new SimpleInterval("chr9", 130954964, 130954965));
    manuallyCalculatedTwoBaseBoundaries.add(new SimpleInterval("chr9", 130955154, 130955155));
    manuallyCalculatedTwoBaseBoundaries.add(new SimpleInterval("chr9", 130953867, 130953868));
    manuallyCalculatedTwoBaseBoundaries.add(new SimpleInterval("chr9", 130955306, 130955307));

    //////////
    return new PreprocessedAndAnalysisReadyContigWithExpectedResults(
            manuallyCalculatedCpxVariantInducingAssemblyContig,
            manuallyCalculatedCpxVariantCanonicalRepresentation,
            manuallyCalculatedVariantContext,
            dummyRefSequence,
            manuallyCalculatedTwoBaseBoundaries);
}
 
Example 15
Source File: TestUtilsCpxVariantInference.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
private static PreprocessedAndAnalysisReadyContigWithExpectedResults buildForContig_2548_1(final Set<String> canonicalChromosomes,
                                                                                           final SAMSequenceDictionary refSeqDict) {
    final AssemblyContigWithFineTunedAlignments
            analysisReadyContig = makeContigAnalysisReadyForCpxBranch("asm002548:tig00001\t16\tchr2\t16224630\t60\t513M634S\t*\t0\t0\tTCAGCACAATGTCTGCTATCCAGTCACTGCTCTGTATTAGCCTCGCCTTAAATTGCTCGAATAGTAAGCAAGGGTTTATAAAGCATCTGTGTGCTAATCCCATCCTAAAGTGACAGGGATCAATTTAACCTCCTCATTTAAAAGATGAGGAGACAGGCTCAAAGGGCAAAGCCGACTCACCCAGGCTCACACAACCAGAGTGGCAGAGCCTCAGGGCTTGCCCTGCACAGCGTAGCTTCTGTTCTGAGAGGAATGCACCTGGGTTTTGTGTCTTTGGTGCTGGGGTGGTGAATGGGAGGCTCTGACCGATGGCAGAGTTCCCAGGGGATCCTTCAGGCAGCCGGGGGGCCTTTCTTATGGCAGGGCTCTGAAGCAAGTCCTGGCTGACCTTTTCTTGAAGCTGAGCTCTTCTGCCAAGACCCTTGGCCCTGCCCTTGGCCCCTGTAGAAATGCACACCTGTAAGCCTGTAATACCTTCTCCAGATCACAGCAACTAGGGAACTGCTGGAAGCCACGGAGATCAGCAGGGACCGCACCATCCCCAGAAGTGTGGATCCACCCGAGGGTCACGTGGATCCTCAAGGAGTGGAGAGTAGACCCTGAGCCACGTGGGCTTCTAGCAGTTCCCTGGGCCTGCTGGCACCAAGATGGGCTCCCTGTCACAGGCCTGACGGGACCTCTGTGAGGGTCCCAGGAGATGAAGCACATGAAGAGGGTTGGTGCACCAAGCAGGCTCACCAAATATTCACTCCCTGACCCCTTTCCCCACGCAAGCTGTGGTGGGGAAGGAGTGTTTCATGGTGGGCAGCCCAGGTGCCAGTCCCCCCCTTCCCCAGTGATGGGGTACCATTACCCCCACGACAGCCCAGCTTGGTGCCAGCAGGCCCAGGGAACTGCTGGAAGCCCACGTGGCTCAGGTCTACTCCTCACTCCTTGAGGATCCACGTGGCCCTTGGGTGGATCCACACTTCTGGGGATGGTGCGGTCCCAGCTGATCTCCCTGGGGGTCCAGTTAGCCTCATCCTCCTTCACCCAGGGTGGCTGCATGTCAGCACCTGCTCTACCTTCTACCTTAATGGCGTCCCCTAGGCTATCTGTCAGGGGAGTCTATGCAAAGGGGCCCCTTTGAACTTGCCTGTGCCACTCC\t*\tSA:Z:chr2,16225125,-,887H260M,60,0,260;chr2,16226497,-,657H170M1D53M267H,60,14,141;chr2,16225125,+,518H29M1I84M515H,60,7,66;\tMD:Z:513\tRG:Z:GATKSVContigAlignments\tNM:i:0\tAS:i:513\tXS:i:0",
            canonicalChromosomes, refSeqDict);

    final CpxVariantInducingAssemblyContig.BasicInfo manuallyCalculatedBasicInfo =
            new CpxVariantInducingAssemblyContig.BasicInfo("chr2", false,
                    new SimpleInterval("chr2", 16224630, 16224630), new SimpleInterval("chr2", 16225384, 16225384));

    //////////
    final List<CpxVariantInducingAssemblyContig.Jump> manuallyCalculatedJumps =
            Arrays.asList(
                    new CpxVariantInducingAssemblyContig.Jump(new SimpleInterval("chr2", 16225125, 16225125), new SimpleInterval("chr2", 16226720, 16226720), StrandSwitch.NO_SWITCH, 7),
                    new CpxVariantInducingAssemblyContig.Jump(new SimpleInterval("chr2", 16226497, 16226497), new SimpleInterval("chr2", 16225125, 16225125), StrandSwitch.REVERSE_TO_FORWARD, 28),
                    new CpxVariantInducingAssemblyContig.Jump(new SimpleInterval("chr2", 16225237, 16225237), new SimpleInterval("chr2", 16225142, 16225142), StrandSwitch.FORWARD_TO_REVERSE, 2)
            );
    final List<SimpleInterval> manuallyCalculatedEventPrimaryChromosomeSegmentingLocations =
            Arrays.asList(
                    new SimpleInterval("chr2", 16225125, 16225125),
                    new SimpleInterval("chr2", 16225142, 16225142),
                    new SimpleInterval("chr2", 16225237, 16225237)
            );
    final CpxVariantInducingAssemblyContig manuallyCalculatedCpxVariantInducingAssemblyContig =
            new CpxVariantInducingAssemblyContig(analysisReadyContig,
                    manuallyCalculatedBasicInfo,
                    manuallyCalculatedJumps,
                    manuallyCalculatedEventPrimaryChromosomeSegmentingLocations);

    //////////
    final SimpleInterval manuallyCalculatedAffectedRefRegion =
            new SimpleInterval("chr2", 16225125, 16225237);
    final List<SimpleInterval> manuallyCalculatedSegments =
            Arrays.asList(
                    new SimpleInterval("chr2", 16225125, 16225142),
                    new SimpleInterval("chr2", 16225142, 16225237));
    final List<String> manuallyCalculatedAltArrangementDescription =
            Arrays.asList("1", "UINS-2", "-2", "-1", "UINS-28", "chr2:16226497-16226720", "UINS-7", "1", "2");
    final byte[] manuallyCalculatedAltSeq = Arrays.copyOfRange(analysisReadyContig.getContigSequence(), 147, 652);
    SequenceUtil.reverseComplement(manuallyCalculatedAltSeq);
    final CpxVariantCanonicalRepresentation manuallyCalculatedCpxVariantCanonicalRepresentation =
            new CpxVariantCanonicalRepresentation(
                    manuallyCalculatedAffectedRefRegion,
                    manuallyCalculatedSegments,
                    manuallyCalculatedAltArrangementDescription,
                    manuallyCalculatedAltSeq);

    //////////
    final byte[] dummyRefSequence = "T".getBytes();
    final VariantContextBuilder variantContextBuilder = new VariantContextBuilder()
            .chr("chr2").start(16225125).stop(16225237)
            .alleles(Arrays.asList(Allele.create(dummyRefSequence, true), Allele.create(SimpleSVType.createBracketedSymbAlleleString(CPX_SV_SYB_ALT_ALLELE_STR), false)))
            .id("CPX_chr2:16225125-16225237")
            .attribute(VCFConstants.END_KEY, 16225237)
            .attribute(SVTYPE, GATKSVVCFConstants.CPX_SV_SYB_ALT_ALLELE_STR)
            .attribute(SVLEN, 392)
            .attribute(SEQ_ALT_HAPLOTYPE, new String(manuallyCalculatedAltSeq));
    variantContextBuilder.attribute(CPX_EVENT_ALT_ARRANGEMENTS,
            String.join(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR, manuallyCalculatedAltArrangementDescription));
    variantContextBuilder.attribute(CPX_SV_REF_SEGMENTS,
            String.join(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR, manuallyCalculatedSegments.stream().map(SimpleInterval::toString).collect(Collectors.toList())));
    final VariantContext manuallyCalculatedVariantContext = variantContextBuilder.make();

    //////////
    final Set<SimpleInterval> manuallyCalculatedTwoBaseBoundaries = new HashSet<>();
    manuallyCalculatedTwoBaseBoundaries.add(new SimpleInterval("chr2", 16225125, 16225126));
    manuallyCalculatedTwoBaseBoundaries.add(new SimpleInterval("chr2", 16225383, 16225384));
    manuallyCalculatedTwoBaseBoundaries.add(new SimpleInterval("chr2", 16225236, 16225237));
    manuallyCalculatedTwoBaseBoundaries.add(new SimpleInterval("chr2", 16224630, 16224631));
    manuallyCalculatedTwoBaseBoundaries.add(new SimpleInterval("chr2", 16225141, 16225142));

    //////////
    return new PreprocessedAndAnalysisReadyContigWithExpectedResults(
            manuallyCalculatedCpxVariantInducingAssemblyContig,
            manuallyCalculatedCpxVariantCanonicalRepresentation,
            manuallyCalculatedVariantContext,
            dummyRefSequence,
            manuallyCalculatedTwoBaseBoundaries);
}
 
Example 16
Source File: SingleContigReferenceAlignerUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
@Test(dataProvider = "testAlignmentData")
public void testAlignment(final boolean pairAlignment, final byte[] reference, final String referenceName)
    throws IOException
{
    try (final SingleSequenceReferenceAligner<byte[], AlignedContig> aligner =  SingleSequenceReferenceAligner.contigsAligner(referenceName, reference,
            a -> "ctg", a -> a);) {
        Assert.assertNotNull(aligner.getAligner());
        if (pairAlignment) {
            aligner.getAligner().alignPairs();
        }
        final Random rdn = new Random(13111);
        final RandomDNA rdnDNA = new RandomDNA(rdn);
        final List<List<AlignmentInterval>> expected = new ArrayList<>(NUM_ALIGNS << 1);
        final List<byte[]> seqs = new ArrayList<>(NUM_ALIGNS << 1);
        for (int i = 0; i < NUM_ALIGNS; i++) {
            final int start = rdn.nextInt(reference.length - READ_LENGTH) + 1;
            final boolean forward = rdn.nextBoolean();
            final boolean insert = rdn.nextDouble() < 0.1;
            final boolean deletion = !insert && rdn.nextDouble() < 0.1;
            final int indelLength = insert || deletion ? rdn.nextInt(10) + 10 : 0;
            final int indelStart = insert || deletion ? rdn.nextInt((int) (READ_LENGTH  * .50)) + 25 : -1;
            final int end = start + READ_LENGTH - 1;
            final byte[] templateSeq = Arrays.copyOfRange(reference, start - 1, end);
            final byte[] actualSeq;
            if (insert) {
                actualSeq = Arrays.copyOf(templateSeq, templateSeq.length + indelLength);
                System.arraycopy(actualSeq, indelStart - 1, actualSeq, indelStart - 1 + indelLength, templateSeq.length - indelStart + 1);
                rdnDNA.nextBases(actualSeq, indelStart - 1, indelLength);
            } else if (deletion) {
                actualSeq = Arrays.copyOf(templateSeq, templateSeq.length - indelLength);
                System.arraycopy(templateSeq, indelStart - 1 + indelLength, actualSeq, indelStart - 1, templateSeq.length - indelStart + 1 - indelLength);
            } else {
                actualSeq = templateSeq.clone();
            }

            while (insert && actualSeq[indelStart - 1] == actualSeq[indelStart + indelLength - 1]) {
                actualSeq[indelStart + indelLength - 1] = rdnDNA.nextBase();
            }
            if (!forward) {
                SequenceUtil.reverseComplement(actualSeq);
            }
            seqs.add(actualSeq);
            final Cigar cigar = (!insert && !deletion) ? TextCigarCodec.decode(READ_LENGTH + "M"):
                    (insert ? TextCigarCodec.decode( "" + (indelStart - 1) + "M" + indelLength + "I" + (READ_LENGTH - indelStart + 1) + "M")
                            : TextCigarCodec.decode( "" + (indelStart - 1) + "M" + indelLength + "D" + (READ_LENGTH - indelStart + 1 - indelLength) + "M"));

            expected.add(Collections.singletonList(new AlignmentInterval(new SimpleInterval(referenceName, start, end), 1, actualSeq.length, !forward ? CigarUtils.invertCigar(cigar) : cigar
                    , forward, 0, 0, 0, null)));
        }
        final List<AlignedContig> results = aligner.align(seqs);
        final Map<byte[], AlignedContig> mapResult = aligner.align(seqs, (b, a) -> b);
        Assert.assertEquals(results, new ArrayList<>(mapResult.values()));
        Assert.assertEquals(new ArrayList<>(mapResult.keySet()), mapResult.values().stream().map(AlignedContig::getContigSequence).collect(Collectors.toList()));
        for (int i = 0; i < NUM_ALIGNS; i++) {
            final List<AlignmentInterval> actualValue = results.get(i).getAlignments();
            final List<AlignmentInterval> expectedValue = expected.get(i);
            Assert.assertEquals(actualValue.size(), 1);
            Assert.assertEquals(actualValue.get(0).forwardStrand, expectedValue.get(0).forwardStrand);
            Assert.assertEquals(actualValue.get(0).referenceSpan, expectedValue.get(0).referenceSpan, expectedValue.get(0).cigarAlong5to3DirectionOfContig.toString());
            Assert.assertEquals(actualValue.get(0).startInAssembledContig, expectedValue.get(0).startInAssembledContig);
            Assert.assertEquals(actualValue.get(0).endInAssembledContig, expectedValue.get(0).endInAssembledContig);
            final Cigar expectedCigar = expectedValue.get(0).cigarAlong5to3DirectionOfContig;
            final Cigar actualCigar = actualValue.get(0).cigarAlong5to3DirectionOfContig;
            if (!expectedCigar.equals(actualCigar)) { // small differences may occur due to ambiguous indel location. So we check that they are small differences indeed:
                Assert.assertEquals(expectedCigar.numCigarElements(), actualCigar.numCigarElements()); // same number of elements
                Assert.assertEquals(expectedCigar.getCigarElements().stream().map(CigarElement::getOperator).collect(Collectors.toList()),
                        actualCigar.getCigarElements().stream().map(CigarElement::getOperator).collect(Collectors.toList())); // same operators sequence.
                // then we check the total lengths per operator (must be the same):
                final Map<CigarOperator, Integer> expectedLengthByOperator = expectedCigar.getCigarElements().stream()
                        .collect(Collectors.groupingBy(CigarElement::getOperator,
                                Collectors.reducing(0, CigarElement::getLength, (a, b) -> a + b)));
                final Map<CigarOperator, Integer> actualLengthByOperator = actualCigar.getCigarElements().stream()
                        .collect(Collectors.groupingBy(CigarElement::getOperator,
                                Collectors.reducing(0, CigarElement::getLength, (a, b) -> a + b)));
                Assert.assertEquals(actualLengthByOperator, expectedLengthByOperator);
                // finally we don't allow more than 5 bases length difference for any given element.
                for (int j = 0; j < expectedCigar.numCigarElements(); j++) {
                    Assert.assertTrue(Math.abs(expectedCigar.getCigarElement(j).getLength() - actualCigar.getCigarElement(j).getLength()) < 10, "actual: " + actualCigar + " != expected: " + expectedCigar);
                }
            }
        }
    }
}
 
Example 17
Source File: AnnotationUtils.java    From dataflow-java with Apache License 2.0 4 votes vote down vote up
/**
 * Determines the effect of the given allele at the given position within a
 * transcript. Currently, this routine is relatively primitive: it only works
 * with SNPs and only computes effects on coding regions of the transcript.
 * This utility currently does not handle any splice sites well, including
 * splice site disruption and codons which span two exons.
 *
 * @param variantStart 0-based absolute start for the variant.
 * @param allele Bases to substitute at {@code variantStart}.
 * @param transcript The affected transcript.
 * @param transcriptBases The reference bases which span the provided
 *     {@code transcript}. Must match the exact length of the
 *     {@code transcript.position}.
 * @return The effect of this variant on the given transcript, or null if
 *     unknown.
 */
public static VariantEffect determineVariantTranscriptEffect(
    long variantStart, String allele, Annotation transcript, String transcriptBases) {
  long txLen = transcript.getEnd() - transcript.getStart();
  Preconditions.checkArgument(transcriptBases.length() == txLen,
      "transcriptBases must have equal length to the transcript; got " +
          transcriptBases.length() + " and " + txLen + ", respectively");
  if (allele.length() != 1) {
    LOG.fine("determineVariantTranscriptEffects() only supports SNPs, ignoring " + allele);
    return null;
  }
  if (transcript.getTranscript().getCodingSequence() == null) {
    LOG.fine("determineVariantTranscriptEffects() only supports intersection with coding " +
        "transcript, ignoring ");
    return null;
  }

  long variantEnd = variantStart + 1;
  Range<Long> variantRange = Range.closedOpen(variantStart, variantEnd);
  Range<Long> codingRange = Range.closedOpen(
      transcript.getTranscript().getCodingSequence().getStart(),
      transcript.getTranscript().getCodingSequence().getEnd());
  if (Boolean.TRUE.equals(transcript.getReverseStrand())) {
    allele = SequenceUtil.reverseComplement(allele);
  }
  for (Exon exon : transcript.getTranscript().getExons()) {
    // For now, only compute effects on variants within the coding region of an exon.
    Range<Long> exonRange = Range.closedOpen(exon.getStart(), exon.getEnd());
    if (exonRange.isConnected(codingRange) &&
        exonRange.intersection(codingRange).isConnected(variantRange) &&
        !exonRange.intersection(codingRange).intersection(variantRange).isEmpty()) {
      // Get the bases which correspond to this exon.
      int txOffset = transcript.getStart().intValue();
      String exonBases = transcriptBases.substring(
          exon.getStart().intValue() - txOffset, exon.getEnd().intValue() - txOffset);
      int variantExonOffset = (int) (variantStart - exon.getStart());

      if (Boolean.TRUE.equals(transcript.getReverseStrand())) {
        // Normalize the offset and bases to 5' -> 3'.
        exonBases = SequenceUtil.reverseComplement(exonBases);
        variantExonOffset = (int) (exon.getEnd() - variantEnd);
      }

      // Determine the indices for the codon which contains this variant.
      if (exon.getFrame() == null) {
        LOG.fine("exon lacks frame data, cannot determine effect");
        return null;
      }
      int offsetWithinCodon = (variantExonOffset + exon.getFrame()) % 3;
      int codonExonOffset = variantExonOffset - offsetWithinCodon;
      if (codonExonOffset < 0 || exonBases.length() <= codonExonOffset+3) {
        LOG.fine("variant codon spans multiple exons, this case is not yet handled");
        return null;
      }
      String fromCodon = exonBases.substring(codonExonOffset, codonExonOffset+3);
      String toCodon =
          fromCodon.substring(0, offsetWithinCodon) + allele +
          fromCodon.substring(offsetWithinCodon + 1);
      return codonChangeToEffect(fromCodon, toCodon);
    }
  }
  return null;
}
 
Example 18
Source File: PSFilterTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
@Test
public void testCanonicalizeRead() {

    final GATKRead read_1 = ArtificialReadUtils.createRandomRead(100);
    read_1.setBases(ArtificialReadUtils.createRandomReadBases(100, true));

    final GATKRead read_2 = read_1.copy();
    final byte[] bases_2 = read_1.getBases();
    SequenceUtil.reverseComplement(bases_2);
    read_2.setBases(bases_2);

    final GATKRead read_3 = read_1.copy();

    final GATKRead read_4 = ArtificialReadUtils.createRandomRead(100);
    read_4.setBases(ArtificialReadUtils.createRandomReadBases(100, true));

    final GATKRead read_5 = ArtificialReadUtils.createRandomRead(100);
    final byte[] bases_3 = read_4.getBases();
    switch (bases_3[bases_3.length - 1]) {
        case 'A':
            bases_3[bases_3.length - 1] = 'G';
            break;
        default:
            bases_3[bases_3.length - 1] = 'A';
            break;
    }
    read_5.setBases(bases_3);

    Tuple2<Long, GATKRead> result_1 = PSFilter.canonicalizeRead(read_1);
    Tuple2<Long, GATKRead> result_2 = PSFilter.canonicalizeRead(read_2);
    Tuple2<Long, GATKRead> result_3 = PSFilter.canonicalizeRead(read_3);
    Tuple2<Long, GATKRead> result_4 = PSFilter.canonicalizeRead(read_4);
    Tuple2<Long, GATKRead> result_5 = PSFilter.canonicalizeRead(read_5);
    Assert.assertEquals(result_1._1, result_2._1);
    Assert.assertEquals(result_1._1, result_3._1);
    Assert.assertNotEquals(result_1._1, result_4._1);
    Assert.assertNotEquals(result_4._1, result_5._1);

    Assert.assertEquals(read_1, result_1._2);
    Assert.assertEquals(read_2, result_2._2);
    Assert.assertEquals(read_3, result_3._2);
    Assert.assertEquals(read_4, result_4._2);
    Assert.assertEquals(read_5, result_5._2);
}