/* * MIT License * * Copyright 2017 Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ package org.broadinstitute.dropseqrna.readtrimming; import java.io.File; import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.stream.Collectors; import org.broadinstitute.dropseqrna.barnyard.ParseBarcodeFile; import htsjdk.samtools.util.SequenceUtil; import htsjdk.samtools.util.StringUtil; public class TrimSequenceTemplate { private final String sequence; private final String reverseComplement; private final byte[] bases; @SuppressWarnings("unused") private final byte[] rcBases; private byte[] ignoredBases; public static final byte A = 'A', C = 'C', G = 'G', T = 'T'; public TrimSequenceTemplate(final String sequence, final String ignoredBases) { this.sequence = sequence; this.reverseComplement = SequenceUtil.reverseComplement(this.sequence); bases = StringUtil.stringToBytes(this.sequence); rcBases = StringUtil .stringToBytes(this.reverseComplement); this.ignoredBases = StringUtil .stringToBytes(ignoredBases); } public TrimSequenceTemplate(final String barcode) { this.sequence = barcode; this.reverseComplement = SequenceUtil.reverseComplement(this.sequence); bases = StringUtil.stringToBytes(this.sequence); rcBases = StringUtil.stringToBytes(this.reverseComplement); this.ignoredBases = StringUtil.stringToBytes("Nn"); } public String getSequence() { return this.sequence; } public String reverseComplement() { return this.reverseComplement; } /** * Test to see if this read matches this barcode. If any base of a barcode * starts with N or n, then ignore that position. * * @param testString * The read to look for this barcode in. The barcode should be at * the start of the read for this method. The entire barcode is expected for a match. * @return true if this barcode is found in the read. */ public boolean hasForwardMatch(final String testString) { byte[] testBases = StringUtil.stringToBytes(testString); int numBasesCanMatch = 0; int numBasesMatch = 0; for (int i = 0; i < bases.length; i++) { if (isIgnoreBase(this.bases[i])) continue; numBasesCanMatch++; if (SequenceUtil.basesEqual(testBases[i], bases[i])) numBasesMatch++; } if (numBasesCanMatch == numBasesMatch) return (true); return false; } /** * Does the testString have part (or all) of the template in it? * The test string starting at position 1 may have some portion of the template. * For example, the last 10 bases of the template would be the first 10 bases of the testString. * The behavior of this is a bit more restricted than getPositionInRead, as it requires you to start finding a match between the template and read, * then continue it to the end of the read. This is designed to find a some portion of the template at the end of a read, such as an adapter that's glued * on to the end of a read. * @param testString * @param minMatch * @param mismatchesAllowed * @return The position in the template */ public int getPositionInTemplate (final String testString, final int minMatch, final int mismatchesAllowed) { byte [] read = StringUtil.stringToBytes(testString); // If the read's too short we can't possibly match it if (read == null || read.length < minMatch) return -1; // read = 48 length // template seq = 41 length // Walk forwards READ_LOOP: for (int start = 0; start<this.bases.length; start++) { final int length = Math.min((this.bases.length - start), read.length); int mismatches = 0; for (int i = 0; i < length; i++) { int idx=i+start; //char base = (char)this.bases[idx]; //char readBase = (char) read[i]; if (!SequenceUtil.isNoCall(this.bases[i]) && !SequenceUtil.basesEqual(this.bases[idx], read[i])) if (++mismatches > mismatchesAllowed) continue READ_LOOP; } // If we got this far without breaking out, then it matches. Make sure it matches at least the min run of bases to return a result. int lengthMatch=this.bases.length-start-mismatches; if (lengthMatch<minMatch) return (-1); return start; } return -1; } /** * Does the testString have part (or all) of the template in it? * The test string starting at position 1 may have some portion of the template. * For example, ANY bases of the template could have the 8 bases in the middle of a 30 base testString. * This is more flexible and slow than getPositionInTemplate, as it will let any portion of the template occur anywhere in the read. * @param testString A string to test against the template * @param minMatch The number of bases that must match in both * @param mismatchesAllowed How many mismatches can there be between the template and the read * @return The position in the read, 0 based. */ public int getPositionInRead (final String testString, final int minMatch, final int mismatchesAllowed) { byte [] read = StringUtil.stringToBytes(testString); // If the read's too short we can't possibly match it if (read == null || read.length < minMatch) return -1; int maxNumMatches=0; int bestReadStartPos=0; int lastViableReadIndex=read.length-minMatch+1; // Walk forwards READ_LOOP: // walks through the read, looking for the template. for (int readIndex = 0; readIndex<lastViableReadIndex; readIndex++) { int mismatches = 0; int numMatches= 0; // can only search as far as you have left in the read. final int searchLength = Math.min(this.bases.length, read.length-readIndex); if (searchLength<minMatch) break; // if you don't have enough search space left to match a min number of bases you give up. for (int templateIndex = 0; templateIndex < searchLength; templateIndex++) { int tempReadIndex=templateIndex+readIndex; char templateBase = (char)this.bases[templateIndex]; char readBase = (char) read[tempReadIndex]; if (SequenceUtil.isNoCall(read[tempReadIndex]) || !SequenceUtil.basesEqual(this.bases[templateIndex], read[tempReadIndex])) { if (++mismatches > mismatchesAllowed) continue READ_LOOP; } else numMatches++; if (numMatches>maxNumMatches) { maxNumMatches=numMatches; bestReadStartPos=readIndex; } } } if (maxNumMatches<minMatch) return (-1); return bestReadStartPos; } /** * Does this read have the barcode anywhere in it? The Barcode would be seen * in the reverse orientation, as it's seen on the other strand. This is an * exact match test, so a barcode should not have an ignored base, but the * actual base as observed on the other strand. This is to filter out * barcodes on short inserts. * * @param readString * The read string. * @param barcode * The barcode in the forward orientation. * @return The index of the 5' most base of the barcode in the read, or -1 * if it doesn't exist. This is 1 based - IE: base 1 is the first * base. */ public static int findBarcodeIndexReverseMatch(final String readString,final String barcode) { String bc = SequenceUtil.reverseComplement(barcode); int index = readString.indexOf(bc); if (index > -1) index++; return index; } public boolean isIgnoreBase(final Byte base) { return (baseInBaseList(base, this.ignoredBases)); } public static boolean baseInBaseList(final Byte base, final byte[] baseList) { for (Byte b : baseList) if (SequenceUtil.basesEqual(b, base)) return true; return false; } public byte[] getIgnoredBases() { return this.ignoredBases; } public static Collection<TrimSequenceTemplate> parseBarcodesFromFile(final File f) { List<String> templates = ParseBarcodeFile.readCellBarcodeFile(f); List<TrimSequenceTemplate>result = templates.stream().map(x->new TrimSequenceTemplate(x)).collect(Collectors.toList()); return result; } /** * If a barcode has ignore bases, then expand those bases to A/C/G/T. * Otherwise, return the barcode. This is recursive, so multiple ignored * bases will be expanded. * * @return */ public static Collection<TrimSequenceTemplate> expandBarcode( final TrimSequenceTemplate b, final byte[] ignoredBases) { Collection<TrimSequenceTemplate> result = new ArrayList<>(); result.add(b); byte[] bases = StringUtil.stringToBytes(b.getSequence()); for (int i = 0; i < bases.length; i++) { boolean ignoreBaseFound = baseInBaseList(bases[i], ignoredBases); if (ignoreBaseFound) { result.remove(b); bases[i] = A; TrimSequenceTemplate newBC = new TrimSequenceTemplate( StringUtil.bytesToString(bases), StringUtil.bytesToString(ignoredBases)); Collection<TrimSequenceTemplate> r = expandBarcode(newBC, ignoredBases); result.addAll(r); bases[i] = C; newBC = new TrimSequenceTemplate( StringUtil.bytesToString(bases), StringUtil.bytesToString(ignoredBases)); r = expandBarcode(newBC, ignoredBases); result.addAll(r); bases[i] = G; newBC = new TrimSequenceTemplate( StringUtil.bytesToString(bases), StringUtil.bytesToString(ignoredBases)); r = expandBarcode(newBC, ignoredBases); result.addAll(r); bases[i] = T; newBC = new TrimSequenceTemplate( StringUtil.bytesToString(bases), StringUtil.bytesToString(ignoredBases)); r = expandBarcode(newBC, ignoredBases); result.addAll(r); break; // stop looping } } return (result); } @Override public String toString() { return this.sequence; } }