htsjdk.samtools.util.SequenceUtil Java Examples

The following examples show how to use htsjdk.samtools.util.SequenceUtil. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: QualityScoreDistributionSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Adds a read to this count object by increasing counts of the base qualities.
 */
Counts addRead(final GATKRead read) {
    final byte[] bases = read.getBases();
    final byte[] quals = read.getBaseQualities();
    final byte[] oq    = ReadUtils.getOriginalBaseQualities(read);

    final int length = quals.length;

    for (int i=0; i<length; ++i) {
        if (includeNoCalls || !SequenceUtil.isNoCall(bases[i])) {
            qCounts[quals[i]]++;
            if (oq != null) {
                oqCounts[oq[i]]++;
            }
        }
    }
    return this;
}
 
Example #2
Source File: TrimSequenceTemplate.java    From Drop-seq with MIT License 6 votes vote down vote up
/**
 * Test to see if this read matches this barcode. If any base of a barcode
 * starts with N or n, then ignore that position.
 *
 * @param testString
 *            The read to look for this barcode in. The barcode should be at
 *            the start of the read for this method.  The entire barcode is expected for a match.
 * @return true if this barcode is found in the read.
 */
public boolean hasForwardMatch(final String testString) {
	byte[] testBases = StringUtil.stringToBytes(testString);
	int numBasesCanMatch = 0;
	int numBasesMatch = 0;
	for (int i = 0; i < bases.length; i++) {
		if (isIgnoreBase(this.bases[i]))
			continue;
		numBasesCanMatch++;
		if (SequenceUtil.basesEqual(testBases[i], bases[i]))
			numBasesMatch++;
	}
	if (numBasesCanMatch == numBasesMatch)
		return (true);
	return false;
}
 
Example #3
Source File: TestUtilsForAssemblyBasedSVDiscovery.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
public static AlignedContig fromPrimarySAMRecordString(final String samRecordStringWithExplicitTabEscapeSequenceAndNoXAField,
                                                       final boolean hasSATag) {
    final AlignmentInterval primaryAlignment = fromSAMRecordString(samRecordStringWithExplicitTabEscapeSequenceAndNoXAField, hasSATag);

    final String[] fields = samRecordStringWithExplicitTabEscapeSequenceAndNoXAField.split("\t");
    final String readName = fields[0];
    final int samFlag = Integer.valueOf( fields[1] ) ;
    final byte[] sequence = fields[9].getBytes();
    final boolean forwardStrand = SAMFlag.READ_REVERSE_STRAND.isUnset(samFlag);
    if (!forwardStrand) {
        SequenceUtil.reverseComplement(sequence);
    }

    if (hasSATag) {
        final String saTag = fields[11];
        final String[] supplements = saTag.substring(3 + saTag.indexOf(":Z:")).split(";");
        final List<AlignmentInterval> alignments = new ArrayList<>(supplements.length + 1);
        alignments.add(primaryAlignment);
        for (final String sup : supplements) {
            alignments.add( new AlignmentInterval(sup) );
        }
        return new AlignedContig(readName, sequence, alignments);
    } else {
        return new AlignedContig(readName, sequence, Collections.singletonList(primaryAlignment));
    }
}
 
Example #4
Source File: ReadBaseStratification.java    From picard with MIT License 6 votes vote down vote up
@Override
public Integer stratify(final SAMRecord sam) {
    int numberOfNsInRead = 0;

    // Check to see if we've already seen this record before.  If not, count the number
    // of Ns in the read, and store it in a transient attribute.  If we have seen it
    // before, simply retrieve the value and avoid doing the computation again.
    if (sam.getTransientAttribute(numberOfNsTag) != null) {
        numberOfNsInRead = (Integer) sam.getTransientAttribute(numberOfNsTag);
    } else {
        final byte[] bases = sam.getReadBases();
        for (final byte base : bases) {
            if (SequenceUtil.isNoCall(base)) {
                numberOfNsInRead++;
            }
        }
        sam.setTransientAttribute(numberOfNsTag, numberOfNsInRead);
    }
    return numberOfNsInRead;
}
 
Example #5
Source File: GcBiasUtils.java    From picard with MIT License 6 votes vote down vote up
public static int calculateGc(final byte[] bases, final int startIndex, final int endIndex, final CalculateGcState state) {
    if (state.init) {
        state.init = false;
        state.gcCount = 0;
        state.nCount = 0;
        for (int i = startIndex; i < endIndex; ++i) {
            final byte base = bases[i];
            if (SequenceUtil.basesEqual(base, (byte)'G') || SequenceUtil.basesEqual(base, (byte)'C')) ++state.gcCount;
            else if (SequenceUtil.basesEqual(base, (byte)'N')) ++state.nCount;
        }
    } else {
        final byte newBase = bases[endIndex - 1];
        if (SequenceUtil.basesEqual(newBase, (byte)'G') || SequenceUtil.basesEqual(newBase, (byte)'C')) ++state.gcCount;
        else if (newBase == 'N') ++state.nCount;

        if (SequenceUtil.basesEqual(state.priorBase, (byte)'G') || SequenceUtil.basesEqual(state.priorBase, (byte)'C')) --state.gcCount;
        else if (SequenceUtil.basesEqual(state.priorBase, (byte)'N')) --state.nCount;
    }
    state.priorBase = bases[startIndex];
    if (state.nCount > 4) return -1;
    else return (state.gcCount * 100) / (endIndex - startIndex);
}
 
Example #6
Source File: AdapterUtility.java    From picard with MIT License 6 votes vote down vote up
/**
 * Checks the first ADAPTER_MATCH_LENGTH bases of the read against known adapter sequences and returns
 * true if the read matches an adapter sequence with MAX_ADAPTER_ERRORS mismsatches or fewer.
 *
 * @param read the basecalls for the read in the order and orientation the machine read them
 * @param  revCompRead When aligned reads are checked for being adapter, this specified if the original
 *                     read had ben rev-comped during alignment.
 * @return true if the read matches an adapter and false otherwise
 */
public boolean isAdapterSequence(final byte[] read, boolean revCompRead) {
    if (read.length < ADAPTER_MATCH_LENGTH) return false;

    for (final byte[] adapter : adapterKmers) {
        int errors = 0;

        for (int i = 0; i < adapter.length; ++i) {
            final byte base = revCompRead ? SequenceUtil.complement(read[read.length - i - 1]) : read[i];
            if (!SequenceUtil.basesEqual(base, adapter[i]) && ++errors > MAX_ADAPTER_ERRORS) {
                break;
            }
        }

        if (errors <= MAX_ADAPTER_ERRORS) {
            return true;
        }
    }

    return false;
}
 
Example #7
Source File: RnaSeqMetricsCollector.java    From picard with MIT License 6 votes vote down vote up
public static OverlapDetector<Interval> makeOverlapDetector(final File samFile, final SAMFileHeader header, final File ribosomalIntervalsFile, final Log log) {

        final OverlapDetector<Interval> ribosomalSequenceOverlapDetector = new OverlapDetector<Interval>(0, 0);
        if (ribosomalIntervalsFile != null) {

            final IntervalList ribosomalIntervals = IntervalList.fromFile(ribosomalIntervalsFile);
            if (ribosomalIntervals.size() == 0) {
                log.warn("The RIBOSOMAL_INTERVALS file, " + ribosomalIntervalsFile.getAbsolutePath() + " does not contain intervals");
            }
            try {
                SequenceUtil.assertSequenceDictionariesEqual(header.getSequenceDictionary(), ribosomalIntervals.getHeader().getSequenceDictionary());
            } catch (SequenceUtil.SequenceListsDifferException e) {
                throw new PicardException("Sequence dictionaries differ in " + samFile.getAbsolutePath() + " and " + ribosomalIntervalsFile.getAbsolutePath(),
                        e);
            }
            final IntervalList uniquedRibosomalIntervals = ribosomalIntervals.uniqued();
            final List<Interval> intervals = uniquedRibosomalIntervals.getIntervals();
            ribosomalSequenceOverlapDetector.addAll(intervals, intervals);
        }
        return ribosomalSequenceOverlapDetector;
    }
 
Example #8
Source File: QualityScoreDistribution.java    From picard with MIT License 6 votes vote down vote up
@Override
protected void acceptRead(final SAMRecord rec, final ReferenceSequence ref) {
    // Skip unwanted records
    if (PF_READS_ONLY && rec.getReadFailsVendorQualityCheckFlag()) return;
    if (ALIGNED_READS_ONLY && rec.getReadUnmappedFlag()) return;
    if (rec.isSecondaryOrSupplementary()) return;

    final byte[] bases = rec.getReadBases();
    final byte[] quals = rec.getBaseQualities();
    final byte[] oq    = rec.getOriginalBaseQualities();

    final int length = quals.length;

    for (int i=0; i<length; ++i) {
        if (INCLUDE_NO_CALLS || !SequenceUtil.isNoCall(bases[i])) {
            qCounts[quals[i]]++;
            if (oq != null) oqCounts[oq[i]]++;
        }
    }
}
 
Example #9
Source File: SingleBarcodeDistanceMetric.java    From picard with MIT License 6 votes vote down vote up
public int hammingDistance() {
    int numMismatches = 0;
    for (int i = 0; i < barcodeBases.length && i < readBases.length && numMismatches <= maximalInterestingDistance; ++i) {

        if (SequenceUtil.isNoCall(readBases[i])) {
            continue;
        }

        if (!SequenceUtil.basesEqual(barcodeBases[i], readBases[i])) {
            ++numMismatches;
            continue;
        }

        // bases are equal but if quality is low we still penalize.
        // TODO: is it actually useful to penalize barcodes in this way?
        if (readQualities != null && readQualities[i] < minimumBaseQuality) {
            ++numMismatches;
        }
    }
    return numMismatches;
}
 
Example #10
Source File: ReferenceRegionTest.java    From cramtools with Apache License 2.0 6 votes vote down vote up
@Test
public void test_HangingEnd() {
	int start = 2;

	ReferenceRegion region = new ReferenceRegion(data, 0, "chr1", start);
	Assert.assertEquals(start, region.alignmentStart);
	Assert.assertEquals(0, region.arrayPosition(start));

	for (int pos = start, index = 0; pos < start + 10; pos++, index++) {
		Assert.assertEquals(data[index], region.base(pos));
	}

	String expectedMD5 = SequenceUtil.calculateMD5String(data, 0, data.length);
	String md5 = region.md5(start, data.length + 10);
	Assert.assertEquals(expectedMD5, md5);

	md5 = region.md5(start, data.length + 100);
	Assert.assertEquals(expectedMD5, md5);
}
 
Example #11
Source File: PSFilter.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Returns input read with alignment-related info cleared
 */
private static GATKRead clearReadAlignment(final GATKRead read, final SAMFileHeader header) {
    final GATKRead newRead = new SAMRecordToGATKReadAdapter(new SAMRecord(header));
    newRead.setName(read.getName());
    newRead.setBases(read.getBases());
    newRead.setBaseQualities(read.getBaseQualities());
    if (read.isReverseStrand()) {
        SequenceUtil.reverseComplement(newRead.getBases());
        SequenceUtil.reverseQualities(newRead.getBaseQualities());
    }
    newRead.setIsUnmapped();
    newRead.setIsPaired(read.isPaired());
    if (read.isPaired()) {
        newRead.setMateIsUnmapped();
        if (read.isFirstOfPair()) {
            newRead.setIsFirstOfPair();
        } else if (read.isSecondOfPair()) {
            newRead.setIsSecondOfPair();
        }
    }
    final String readGroup = read.getReadGroup();
    if (readGroup != null) {
        newRead.setAttribute(SAMTag.RG.name(), readGroup);
    }
    return newRead;
}
 
Example #12
Source File: GcBiasMetricsCollector.java    From picard with MIT License 6 votes vote down vote up
private void addRead(final GcObject gcObj, final SAMRecord rec, final String group, final byte[] gc, final byte[] refBases) {
    if (!rec.getReadPairedFlag() || rec.getFirstOfPairFlag()) ++gcObj.totalClusters;
    final int pos = rec.getReadNegativeStrandFlag() ? rec.getAlignmentEnd() - scanWindowSize : rec.getAlignmentStart();
    ++gcObj.totalAlignedReads;
    if (pos > 0) {
        final int windowGc = gc[pos];
        if (windowGc >= 0) {
            ++gcObj.readsByGc[windowGc];
            gcObj.basesByGc[windowGc] += rec.getReadLength();
            gcObj.errorsByGc[windowGc] +=
                    SequenceUtil.countMismatches(rec, refBases, bisulfite) +
                            SequenceUtil.countInsertedBases(rec) + SequenceUtil.countDeletedBases(rec);
        }
    }
    if (gcObj.group == null) {
        gcObj.group = group;
    }
}
 
Example #13
Source File: Read.java    From cramtools with Apache License 2.0 6 votes vote down vote up
void reset(EvidenceRecord evidenceRecord) {
	this.evidenceRecord = evidenceRecord;

	negative = "-".equals(evidenceRecord.Strand);

	baseBuf.clear();
	scoreBuf.clear();
	if (evidenceRecord.Strand.equals("+")) {
		baseBuf.put(evidenceRecord.Sequence.getBytes());
		scoreBuf.put(SAMUtils.fastqToPhred(evidenceRecord.Scores));
	} else {
		byte[] bytes = evidenceRecord.Sequence.getBytes();
		SequenceUtil.reverseComplement(bytes);
		baseBuf.put(bytes);
		bytes = SAMUtils.fastqToPhred(evidenceRecord.Scores);
		SequenceUtil.reverseQualities(bytes);
		scoreBuf.put(bytes);
	}
	baseBuf.flip();
	scoreBuf.flip();

	firstHalf.clear();
	secondHalf.clear();
}
 
Example #14
Source File: ReferenceRegionTest.java    From cramtools with Apache License 2.0 6 votes vote down vote up
@Test
public void test_Start_1() {
	int start = 1;

	ReferenceRegion region = new ReferenceRegion(data, 0, "chr1", start);
	Assert.assertEquals(start, region.alignmentStart);
	Assert.assertEquals(0, region.arrayPosition(start));

	for (int pos = start, index = 0; pos < start + 10; pos++, index++) {
		Assert.assertEquals(data[index], region.base(pos));
	}

	int len = 10;
	String expectedMD5 = SequenceUtil.calculateMD5String(data, 0, len);
	String md5 = region.md5(start, len);
	Assert.assertEquals(expectedMD5, md5);
}
 
Example #15
Source File: SVFastqUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
public FastqRead( final GATKRead read, final boolean includeMappingLocation ) {
    this.header = composeHeaderLine(read, includeMappingLocation);
    this.bases = read.getBases();
    this.quals = read.getBaseQualities();
    if (!read.isUnmapped() && read.isReverseStrand()) {
        SequenceUtil.reverseComplement(this.bases);
        SequenceUtil.reverseQualities(this.quals);
    }
}
 
Example #16
Source File: SvDiscoverFromLocalAssemblyContigAlignmentsSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Iterates through the input {@code noSecondaryAlignments}, which are assumed to contain no secondary alignment (i.e. records with "XA" tag),
 * converts to custom {@link AlignmentInterval} format and
 * split the records when the gap in the alignment reaches the specified {@code sensitivity}.
 * The size of the returned iterable of {@link AlignmentInterval}'s is guaranteed to be no lower than that of the input iterable.
 */
@VisibleForTesting
public static AlignedContig parseReadsAndOptionallySplitGappedAlignments(final Iterable<SAMRecord> noSecondaryAlignments,
                                                                         final int gapSplitSensitivity,
                                                                         final boolean splitGapped) {

    Utils.validateArg(noSecondaryAlignments.iterator().hasNext(), "input collection of GATK reads is empty");

    final SAMRecord primaryAlignment
            = Utils.stream(noSecondaryAlignments).filter(sam -> !sam.getSupplementaryAlignmentFlag())
            .findFirst()
            .orElseThrow(() -> new GATKException("no primary alignment for read " + noSecondaryAlignments.iterator().next().getReadName()));

    Utils.validate(!primaryAlignment.getCigar().containsOperator(CigarOperator.H),
            "assumption that primary alignment does not contain hard clipping is invalid for read: " + primaryAlignment.toString());

    final byte[] contigSequence = primaryAlignment.getReadBases().clone();
    final List<AlignmentInterval> parsedAlignments;
    if ( primaryAlignment.getReadUnmappedFlag() ) { // the Cigar
        parsedAlignments = Collections.emptyList();
    } else {
        if (primaryAlignment.getReadNegativeStrandFlag()) {
            SequenceUtil.reverseComplement(contigSequence);
        }

        final Stream<AlignmentInterval> unSplitAIList = Utils.stream(noSecondaryAlignments).map(AlignmentInterval::new);
        if (splitGapped) {
            final int unClippedContigLength = primaryAlignment.getReadLength();
            parsedAlignments = unSplitAIList.map(ar ->
                    ContigAlignmentsModifier.splitGappedAlignment(ar, gapSplitSensitivity, unClippedContigLength))
                    .flatMap(Utils::stream).collect(Collectors.toList());
        } else {
            parsedAlignments = unSplitAIList.collect(Collectors.toList());
        }
    }
    return new AlignedContig(primaryAlignment.getReadName(), contigSequence, parsedAlignments);
}
 
Example #17
Source File: BreakpointComplications.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
private static byte[] reverseComplementIfNecessary(final byte[] seq,
                                                   final AlignmentInterval first, final AlignmentInterval second,
                                                   final boolean firstAfterSecond) {
    if ( first.forwardStrand == second.forwardStrand ) { // reference order switch
        if (!first.forwardStrand) {
            SequenceUtil.reverseComplement(seq, 0, seq.length);
        }
    } else {
        if (firstAfterSecond == first.forwardStrand) {
            SequenceUtil.reverseComplement(seq, 0, seq.length);
        }
    }
    return seq;
}
 
Example #18
Source File: AssemblyBasedSVDiscoveryTestDataProviderForSimpleSV.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * 100-'A' + 100-'T' and a 50 bases of 'C' is inserted at the A->T junction point (forward strand description)
 * Return a list of two entries for positive and reverse strand representations.
 */
private static List<TestDataForSimpleSV>
forSimpleInsertion() {
    final List<TestDataForSimpleSV> result = new ArrayList<>();

    // simple insertion '+' strand representation
    final String leftRefFlank = TestUtilsForAssemblyBasedSVDiscovery.makeDummySequence('A', 100);
    final String insertedSeq  = TestUtilsForAssemblyBasedSVDiscovery.makeDummySequence('C', 50);
    final String rightRefFlank = TestUtilsForAssemblyBasedSVDiscovery.makeDummySequence('T', 100);
    byte[] contigSeq = (leftRefFlank + insertedSeq + rightRefFlank).getBytes();
    String contigName = "simple_ins_+";

    final SimpleInterval expectedLeftBreakpoint = new SimpleInterval("21:17000100-17000100");
    final SimpleInterval expectedRightBreakpoint = new SimpleInterval("21:17000100-17000100");
    final BreakpointComplications expectedBreakpointComplications = new BreakpointComplications.SimpleInsDelOrReplacementBreakpointComplications("", insertedSeq);
    final byte[] expectedAltSeq = insertedSeq.getBytes();
    final NovelAdjacencyAndAltHaplotype expectedNovelAdjacencyAndAltHaplotype = new NovelAdjacencyAndAltHaplotype(expectedLeftBreakpoint, expectedRightBreakpoint, NO_SWITCH, expectedBreakpointComplications, SIMPLE_INS, expectedAltSeq);
    AlignmentInterval region1 = new AlignmentInterval(new SimpleInterval("21", 17000001, 17000100), 1 ,100, TextCigarCodec.decode("100M100S"), true, 60, 0, 100, ContigAlignmentsModifier.AlnModType.NONE);
    AlignmentInterval region2 = new AlignmentInterval(new SimpleInterval("21", 17000101, 17000200), 151 ,250, TextCigarCodec.decode("100S100M"), true, 60, 0, 100, ContigAlignmentsModifier.AlnModType.NONE);
    SimpleChimera expectedSimpleChimera = new SimpleChimera(contigName, region1, region2, NO_SWITCH, true, Collections.emptyList(), NO_GOOD_MAPPING_TO_NON_CANONICAL_CHROMOSOME);
    DistancesBetweenAlignmentsOnRefAndOnRead expectedDistances = new DistancesBetweenAlignmentsOnRefAndOnRead(0, 50, 17000100, 17000101, 100, 151);
    final List<SvType> expectedSVTypes = Collections.singletonList(makeInsertionType(new SimpleInterval("21:17000100-17000100"), Allele.create("G", true),50));
    final List<VariantContext> expectedVariants = Collections.singletonList(
            addStandardAttributes(makeInsertion("21", 17000100, 17000100, 50, Allele.create("G", true)),
                    100, contigName, SimpleSVType.SupportedType.INS.name(), 17000100, 50, insertedSeq, "", insertedSeq).make());
    result.add(new TestDataForSimpleSV(region1, region2, contigName, contigSeq, false, expectedSimpleChimera, expectedNovelAdjacencyAndAltHaplotype, expectedSVTypes, expectedVariants, expectedDistances, BreakpointsInference.SimpleInsertionDeletionBreakpointsInference.class));

    // simple insertion '-' strand representation
    contigSeq = (SequenceUtil.reverseComplement(rightRefFlank) + SequenceUtil.reverseComplement(insertedSeq) + SequenceUtil.reverseComplement(leftRefFlank)).getBytes();
    contigName = "simple_ins_-";
    region1 = new AlignmentInterval(new SimpleInterval("21", 17000101, 17000200), 1 ,100, TextCigarCodec.decode("100M100S"), false, 60, 0, 100, ContigAlignmentsModifier.AlnModType.NONE);
    region2 = new AlignmentInterval(new SimpleInterval("21", 17000001, 17000100), 151 ,250, TextCigarCodec.decode("100S100M"), false, 60, 0, 100, ContigAlignmentsModifier.AlnModType.NONE);
    expectedDistances = new DistancesBetweenAlignmentsOnRefAndOnRead(0, 50, 17000100, 17000101, 100, 151);
    expectedSimpleChimera = new SimpleChimera(contigName, region1, region2, NO_SWITCH, false, Collections.emptyList(), NO_GOOD_MAPPING_TO_NON_CANONICAL_CHROMOSOME);
    result.add(new TestDataForSimpleSV(region1, region2, contigName, contigSeq, true, expectedSimpleChimera, expectedNovelAdjacencyAndAltHaplotype, expectedSVTypes, expectedVariants, expectedDistances, BreakpointsInference.SimpleInsertionDeletionBreakpointsInference.class));

    return result;
}
 
Example #19
Source File: BreakEndVariantType.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
private static String extractInsertedSequence(final NovelAdjacencyAndAltHaplotype narl, final boolean forUpstreamLoc) {
    final String ins = narl.getComplication().getInsertedSequenceForwardStrandRep();
    if (ins.isEmpty() || narl.getStrandSwitch() == StrandSwitch.NO_SWITCH) {
        return ins;
    } else {
        return forUpstreamLoc == (narl.getStrandSwitch().equals(StrandSwitch.FORWARD_TO_REVERSE) ) ? ins: SequenceUtil.reverseComplement(ins);
    }
}
 
Example #20
Source File: TrimSequenceTemplate.java    From Drop-seq with MIT License 5 votes vote down vote up
/**
 * Does the testString have part (or all) of the template in it?
 * The test string starting at position 1 may have some portion of the template.
 * For example, ANY bases of the template could have the 8 bases in the middle of a 30 base testString.
 * This is more flexible and slow than getPositionInTemplate, as it will let any portion of the template occur anywhere in the read.
 * @param testString A string to test against the template
 * @param minMatch The number of bases that must match in both
 * @param mismatchesAllowed How many mismatches can there be between the template and the read
 * @return The position in the read, 0 based.
 */
public int getPositionInRead (final String testString, final int minMatch, final int mismatchesAllowed) {
	byte [] read = StringUtil.stringToBytes(testString);
	// If the read's too short we can't possibly match it
       if (read == null || read.length < minMatch) return -1;

       int maxNumMatches=0;
       int bestReadStartPos=0;

       int lastViableReadIndex=read.length-minMatch+1;
       // Walk forwards
       READ_LOOP:  // walks through the read, looking for the template.
       for (int readIndex = 0; readIndex<lastViableReadIndex; readIndex++) {
           int mismatches = 0;
           int numMatches= 0;

           // can only search as far as you have left in the read.
           final int searchLength = Math.min(this.bases.length, read.length-readIndex);
           if (searchLength<minMatch) break;  // if you don't have enough search space left to match a min number of bases you give up.
           for (int templateIndex = 0; templateIndex < searchLength; templateIndex++) {
           	int tempReadIndex=templateIndex+readIndex;
           	char templateBase = (char)this.bases[templateIndex];
           	char readBase = (char) read[tempReadIndex];
               if (SequenceUtil.isNoCall(read[tempReadIndex]) || !SequenceUtil.basesEqual(this.bases[templateIndex], read[tempReadIndex])) {
               	if (++mismatches > mismatchesAllowed) continue READ_LOOP;
               } else
				numMatches++;
               if (numMatches>maxNumMatches) {
               	maxNumMatches=numMatches;
               	bestReadStartPos=readIndex;
               }
           }

       }
       if (maxNumMatches<minMatch) return (-1);
       return bestReadStartPos;
}
 
Example #21
Source File: ClippingUtility.java    From picard with MIT License 5 votes vote down vote up
/**
 *  Returns an array of bytes representing the bases in the read,
 *  reverse complementing them if the read is on the negative strand
 */
private static byte[] getReadBases(final SAMRecord read) {
    if (!read.getReadNegativeStrandFlag()) {
        return read.getReadBases();
    }
    else {
        final byte[] reverseComplementedBases = new byte[read.getReadBases().length];
        System.arraycopy(read.getReadBases(), 0, reverseComplementedBases, 0, reverseComplementedBases.length);
        SequenceUtil.reverseComplement(reverseComplementedBases);
        return reverseComplementedBases;
    }
}
 
Example #22
Source File: SingleBarcodeDistanceMetric.java    From picard with MIT License 5 votes vote down vote up
/**
     * Similar to Hamming distance but this version doesn't penalize matching bases with low quality for the read.

     * @return the edit distance between the barcode(s) and the read(s)
     */
public int lenientHammingDistance() {
    int numMismatches = 0;
    for (int i = 0; i < barcodeBases.length && i < maskedBases.length && numMismatches <= maximalInterestingDistance; ++i) {

        if (SequenceUtil.isNoCall(maskedBases[i])) {
            continue;
        }

        if (!SequenceUtil.basesEqual(barcodeBases[i], maskedBases[i])) {
            ++numMismatches;
        }
    }
    return numMismatches;
}
 
Example #23
Source File: BaitDesigner.java    From picard with MIT License 5 votes vote down vote up
/** Gets the bait sequence, with primers, as a String, RC'd as appropriate. */
private String getBaitSequence(final Bait bait, final boolean rc) {
    String sequence = (LEFT_PRIMER == null ? "" : LEFT_PRIMER) +
            StringUtil.bytesToString(bait.getBases()) +
            (RIGHT_PRIMER == null ? "" : RIGHT_PRIMER);

    if (rc) sequence = SequenceUtil.reverseComplement(sequence);
    return sequence;
}
 
Example #24
Source File: BaitDesigner.java    From picard with MIT License 5 votes vote down vote up
/** Method that takes in the reference sequence this bait is on and caches the bait's bases. */
public void addBases(final ReferenceSequence reference, final boolean useStrandInfo) {
    final byte[] tmp = new byte[length()];
    System.arraycopy(reference.getBases(), getStart() - 1, tmp, 0, length());

    if (useStrandInfo && isNegativeStrand()) {
        SequenceUtil.reverseComplement(tmp);
    }

    setBases(tmp);
}
 
Example #25
Source File: AdapterMarker.java    From picard with MIT License 5 votes vote down vote up
private TruncatedAdapterPair(final String name, final String threePrimeReadOrder, final String fivePrimeReadOrder) {
    this.name = name;
    this.threePrime = threePrimeReadOrder;
    this.threePrimeBytes = StringUtil.stringToBytes(threePrimeReadOrder);
    this.fivePrimeReadOrder = fivePrimeReadOrder;
    this.fivePrimeReadOrderBytes = StringUtil.stringToBytes(fivePrimeReadOrder);
    this.fivePrime = SequenceUtil.reverseComplement(fivePrimeReadOrder);
    this.fivePrimeBytes = StringUtil.stringToBytes(this.fivePrime);
}
 
Example #26
Source File: AdapterMarker.java    From picard with MIT License 5 votes vote down vote up
/**
 * Truncate to the given length, and in addition truncate any trailing Ns.
 */
private String substringAndRemoveTrailingNs(final String s, int length) {
    length = Math.min(length, s.length());
    final byte[] bytes = StringUtil.stringToBytes(s);
    while (length > 0 && SequenceUtil.isNoCall(bytes[length - 1])) {
        length--;
    }
    return s.substring(0, length);
}
 
Example #27
Source File: CollectWgsMetrics.java    From picard with MIT License 5 votes vote down vote up
@Override
public void addInfo(final AbstractLocusInfo<SamLocusIterator.RecordAndOffset> info, final ReferenceSequence ref, boolean referenceBaseN) {

    if (referenceBaseN) {
        return;
    }
    // Figure out the coverage while not counting overlapping reads twice, and excluding various things
    final HashSet<String> readNames = new HashSet<>(info.getRecordAndOffsets().size());
    int pileupSize = 0;
    int unfilteredDepth = 0;

    for (final SamLocusIterator.RecordAndOffset recs : info.getRecordAndOffsets()) {
        if (recs.getBaseQuality() <= 2) { ++basesExcludedByBaseq;   continue; }

        // we add to the base quality histogram any bases that have quality > 2
        // the raw depth may exceed the coverageCap before the high-quality depth does. So stop counting once we reach the coverage cap.
        if (unfilteredDepth < coverageCap) {
            unfilteredBaseQHistogramArray[recs.getRecord().getBaseQualities()[recs.getOffset()]]++;
            unfilteredDepth++;
        }

        if (recs.getBaseQuality() < collectWgsMetrics.MINIMUM_BASE_QUALITY ||
                SequenceUtil.isNoCall(recs.getReadBase())) {
            ++basesExcludedByBaseq;
            continue;
        }
        if (!readNames.add(recs.getRecord().getReadName())) {
            ++basesExcludedByOverlap;
            continue;
        }
        pileupSize++;
    }
    final int highQualityDepth = Math.min(pileupSize, coverageCap);
    if (highQualityDepth < pileupSize) basesExcludedByCapping += pileupSize - coverageCap;
    highQualityDepthHistogramArray[highQualityDepth]++;
    unfilteredDepthHistogramArray[unfilteredDepth]++;
}
 
Example #28
Source File: GenomeWarpUtils.java    From genomewarp with Apache License 2.0 5 votes vote down vote up
/**
 * Returns true if the input string is a valid DNA string,
 * which we take to mean only containing the characters ACTGactg.
 *
 * @param inString the input string sequence to test
 * @return a boolean indicating whether the input is a DNA sequence
 */
public static boolean isValidDna(String inString) {
  final byte[] in = StringUtil.stringToBytes(inString);
  for (int i = 0; i < in.length; i++) {
    if (!SequenceUtil.isValidBase(in[i])) {
      return false;
    }
  }
  return true;
}
 
Example #29
Source File: AdapterDescriptor.java    From Drop-seq with MIT License 5 votes vote down vote up
@Override
public String getSequence(SAMRecord rec) {
    final String attribute = (String) rec.getAttribute(binaryTag);
    if (reverseComplement) {
        return SequenceUtil.reverseComplement(attribute);
    } else {
        return attribute;
    }
}
 
Example #30
Source File: ExtractSequences.java    From picard with MIT License 5 votes vote down vote up
@Override
protected int doWork() {
    IOUtil.assertFileIsReadable(INTERVAL_LIST);
    IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
    IOUtil.assertFileIsWritable(OUTPUT);

    final IntervalList intervals = IntervalList.fromFile(INTERVAL_LIST);
    final ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE_SEQUENCE);
    SequenceUtil.assertSequenceDictionariesEqual(intervals.getHeader().getSequenceDictionary(), ref.getSequenceDictionary());

    final BufferedWriter out = IOUtil.openFileForBufferedWriting(OUTPUT);

    for (final Interval interval : intervals) {
        final ReferenceSequence seq = ref.getSubsequenceAt(interval.getContig(), interval.getStart(), interval.getEnd());
        final byte[] bases = seq.getBases();
        if (interval.isNegativeStrand()) SequenceUtil.reverseComplement(bases);

        try {
            out.write(">");
            out.write(interval.getName());
            out.write("\n");

            for (int i=0; i<bases.length; ++i) {
                if (i > 0 && i % LINE_LENGTH == 0) out.write("\n");
                out.write(bases[i]);
            }

            out.write("\n");
        }
        catch (IOException ioe) {
            throw new PicardException("Error writing to file " + OUTPUT.getAbsolutePath(), ioe);

        }
    }

    CloserUtil.close(out);

    return 0;
}