/* Part of Kourami HLA typer/assembler (c) 2017 by Heewook Lee, Carl Kingsford, and Carnegie Mellon University. See LICENSE for licensing. */
import java.util.*;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntIterator;
import org.jgrapht.*;
import org.jgrapht.graph.*;

// A Bubble is a region of the HLA assembly graph bounded by a source (s) node and a
// sink (t) node. It enumerates all s-t paths (candidate haplotype fragments), scores
// path pairs by likelihood, and prunes paths that lack read support.
public class Bubble{

    private HLAGraph g;               // enclosing HLA graph this bubble was cut from
    private ArrayList<Integer> start; // column indices of the s-nodes
    private ArrayList<Integer> end;   // column indices of the t-nodes
    //s- and t- nodes of each bubbles that are merged
    private ArrayList<Node> sNodes;
    private ArrayList<Node> tNodes;
    private ArrayList<Integer> bubbleLengths; // when bubbles get merged bubbleLengths keep track of lengths of each bubble being merged. The length of this list is always equal to the number of merged bubbles.
    private ArrayList<BubblePathLikelihoodScores> bubbleScores;
    //keeps track of paths through the whole bubble(merged or not-merged)
    private ArrayList<Path> paths;
    private boolean firstBubble; // marks the first bubble after a fracture/disconnection

    // Returns the n-th candidate path through this bubble.
    public Path getNthPath(int n){
	return this.paths.get(n);
    }

    // Log-likelihood score for the (i,j) path pair of the n-th bubble.
    // Scores are stored upper-triangular (i<=j), so indices are swapped when needed.
    public double getNthBubbleScore(int n, int i, int j){
	if(i<=j)
	    return this.bubbleScores.get(n).getLogScore(i , j);
	else
	    return this.bubbleScores.get(n).getLogScore(j , i);
    }

    // Read-fraction-based log score for the (i,j) path pair of the n-th bubble
    // (same upper-triangular storage convention as getNthBubbleScore).
    public double getNthBubbleFractionScore(int n, int i, int j){
	if(i<=j)
	    return this.bubbleScores.get(n).getLogFractionScore(i , j);
	else
	    return this.bubbleScores.get(n).getLogFractionScore(j , i);
    }

    public boolean isFirstBubble(){
	return this.firstBubble;
    }

    // Number of bubbles merged into this one. start and bubbleLengths must stay in
    // sync; returns the error sentinel -1000 when they disagree.
    public int numBubbles(){
	if(this.start.size() == this.bubbleLengths.size())
	    return this.start.size();
	else{
	    if(HLA.DEBUG){
		HLA.log.appendln("---------------->>>>>>>>>>>>>> NOOOOOOOOOOOOOOO!!!!!! ");
		HLA.log.appendln("starts length:\t" + this.start.size());
		HLA.log.appendln("ends length:\t" + this.end.size());
		HLA.log.appendln("bubbleLengths length:\t" + this.bubbleLengths.size());
		HLA.log.appendln("numPaths:\t" + this.paths.size());
	    }
	    return -1000;
	}
    }

    // Trims headerExcess columns from the front and tailExcess columns from the back of
    // every path, then shrinks the single recorded bubble length to match.
    // NOTE(review): the length-adjustment if/else runs even when both excesses are 0,
    // so any merged bubble (bubbleLengths.size() != 1) reaching here exits the program —
    // presumably callers only invoke this on unmerged bubbles; confirm.
    public void trimPaths(int headerExcess, int tailExcess){
	if(headerExcess > 0 || tailExcess > 0){
	    if(HLA.DEBUG)
		HLA.log.appendln("Trimming by :\t" + headerExcess + "\t" + tailExcess);
	    for(Path p : paths){
		p.trimPath(headerExcess, tailExcess);
	    }
	}
	if(this.bubbleLengths.size() == 1)
	    this.bubbleLengths.set(0, new Integer(this.bubbleLengths.get(0).intValue() - headerExcess - tailExcess));
	else{
	    HLA.log.appendln("Something is wrong: Trimming length inconsistent. Exitting");
	    HLA.log.outToFile();
	    System.exit(-1);
	}
    }

    // Logs the number of per-bubble sequences held by each path, tab-separated.
    public void printBubbleSequenceSizes(){
	for(Path p : this.paths)
	    HLA.log.append(p.getBubbleSequences().size() + "\t");
	HLA.log.appendln();
    }

    public ArrayList<Integer> getStart(){
	return this.start;
    }

    public ArrayList<Integer> getEnd(){
	return this.end;
    }

    public ArrayList<Node> getSNodes(){
	return this.sNodes;
    }

    public ArrayList<Node> getTNodes(){
	return this.tNodes;
    }

    public ArrayList<Path> getPaths(){
	return this.paths;
    }

    public void printBubbleSequence(){
	for(Path p : this.paths){
	    p.printBubbleSequence(g.getGraph());
	}
    }

    public void initBubbleSequences(){
	for(Path p : this.paths){
	    p.initBubbleSequence(g.getGraph());
	}
    }

    // Logs every candidate path's sequence, interleaving the shared inter-bubble
    // sequences with the path's own per-bubble sequences.
    public void printResults(ArrayList<StringBuffer> interBubbleSequences){
	HLA.log.appendln("Printing\t" + this.paths.size() + "\tpossible sequences" );
	for(int i=0; i<this.paths.size(); i++){
	    Path p = this.paths.get(i);
	    ArrayList<StringBuffer> bubbleSequences = p.getBubbleSequences();
	    HLA.log.appendln(">>>>>>>ITBS\t" + interBubbleSequences.size() + "\tBS\t" + bubbleSequences.size());
	    HLA.log.append(interBubbleSequences.get(0).toString());
	    for(int j=1; j<interBubbleSequences.size(); j++){
		HLA.log.append("<<" + bubbleSequences.get(j-1).toString() + ">>");
		HLA.log.append(interBubbleSequences.get(j).toString());
	    }
	}
    }

    // Removes MSA padding characters ('.' and ' ') from a column-aligned sequence.
    public static String stripPadding(String columnSeq){
	StringBuffer build = new StringBuffer();
	for(int i=0; i<columnSeq.length(); i++){
	    char curchar = columnSeq.charAt(i);
	    if(curchar == '.' || curchar == ' ')
		;
	    else
		build.append(curchar);
	}
	return build.toString();
    }

    //print fractured bubbles
    //returns next startIndex
    public int printResults(ArrayList<StringBuffer> interBubbleSequences, int startIndex, ArrayList<DNAString> sequences, String hlagenename, int superbubbleNumber){
	int tmpStartIndex = startIndex;
	// one output DNAString per candidate path
	for(int i=0; i<this.paths.size(); i++){
	    Path p = this.paths.get(i);
	    int pathnum = i;
	    DNAString curDNA = new DNAString(hlagenename, superbubbleNumber, pathnum , p.getAvgWeightedIntersectionSum() , p.getProbability());
	    //output.append(">" + hlagenename + "_" + superbubbleNumber + "_" + pathnum + "-" + p.getAvgWeightedIntersectionSum() + ":" + p.getProbability() + "\n");
	    HLA.log.appendln("IntersectionScore:\t" + p.getAvgWeightedIntersectionSum() + "\t" + p.getProbability());
	    ArrayList<StringBuffer> bubbleSequences = p.getBubbleSequences();
	    //each bubbleSequence is padded by interBubbleSequences
	    //so we print the first interBubbleSequence.
	    tmpStartIndex = startIndex;
	    // leading inter-bubble sequence only for the very first superbubble or a first bubble
	    if(superbubbleNumber == 0 || this.firstBubble){
		HLA.log.append(interBubbleSequences.get(tmpStartIndex).toString());
		curDNA.append(Bubble.stripPadding(interBubbleSequences.get(tmpStartIndex).toString()));
		//output.append(Bubble.stripPadding(interBubbleSequences.get(tmpStartIndex).toString()));
		tmpStartIndex++;
	    }
	    for(int j=0; j<bubbleSequences.size(); j++){
		HLA.log.append(" <" + bubbleSequences.get(j) + "> ");//prints the bubble
		curDNA.append(Bubble.stripPadding(bubbleSequences.get(j).toString()));
		//output.append(Bubble.stripPadding(bubbleSequences.get(j).toString()));
		//if path ends with bubble, we dont need to print the last interBubbleSequence
		if(tmpStartIndex < interBubbleSequences.size()){
		    HLA.log.append(interBubbleSequences.get(tmpStartIndex).toString()); //prints the interBubble
		    curDNA.append(Bubble.stripPadding(interBubbleSequences.get(tmpStartIndex).toString()));
		    //output.append(Bubble.stripPadding(interBubbleSequences.get(tmpStartIndex).toString()));
		}
		tmpStartIndex++;
	    }
	    HLA.log.appendln();
	    //curDNA.append("\n");
	    //output.append("\n");
	    sequences.add(curDNA);
	}
	return tmpStartIndex;
    }

    // Fractured-superbubble variant: consults bubbles/bubbleOffset so that an
    // inter-bubble sequence is also emitted at every first-bubble boundary.
    // Returns the next startIndex into interBubbleSequences.
    public int printResults(ArrayList<StringBuffer> interBubbleSequences, int startIndex, ArrayList<DNAString> sequences, String hlagenename, int superbubbleNumber, ArrayList<Bubble> bubbles, int bubbleOffset){
	int tmpStartIndex = startIndex;
	//for each path (candidate allele)
	for(int i=0; i<this.paths.size(); i++){
	    Path p = this.paths.get(i);
	    int pathnum = i;
	    DNAString curDNA = new DNAString(hlagenename, superbubbleNumber, pathnum , p.getAvgWeightedIntersectionSum() , p.getProbability());
	    HLA.log.appendln("IntersectionScore:\t" + p.getAvgWeightedIntersectionSum() + "\t" + p.getProbability());
	    ArrayList<StringBuffer> bubbleSequences = p.getBubbleSequences();
	    //each bubbleSequence is padded by interBubbleSequences
	    //so we print the first interBubbleSequence.
	    tmpStartIndex = startIndex;
	    for(int j=0; j<bubbleSequences.size(); j++){
		// a first bubble marks a fracture: emit the inter-bubble sequence before it
		if(bubbles.get(j+bubbleOffset).isFirstBubble()){
		    HLA.log.append("[FB(" + (j+bubbleOffset) +"]");
		    HLA.log.append(interBubbleSequences.get(tmpStartIndex).toString());
		    curDNA.append(Bubble.stripPadding(interBubbleSequences.get(tmpStartIndex).toString()));
		    tmpStartIndex++;
		}
		HLA.log.append(" <" + bubbleSequences.get(j) + "> ");//prints the bubble
		curDNA.append(Bubble.stripPadding(bubbleSequences.get(j).toString()));
		//if path ends with bubble, we dont need to print the last interBubbleSequence
		if(tmpStartIndex < interBubbleSequences.size()){
		    HLA.log.append(interBubbleSequences.get(tmpStartIndex).toString()); //prints the interBubble
		    curDNA.append(Bubble.stripPadding(interBubbleSequences.get(tmpStartIndex).toString()));
		}
		tmpStartIndex++;
	    }
	    HLA.log.appendln();
	    sequences.add(curDNA);
	}
	return tmpStartIndex;
    }

    // Splices each candidate path of this (super)bubble together with the inter-bubble
    // paths into a full Path, copying the per-path statistics across.
    // Returns the next startIndex into interBubblePaths.
    public int mergePathsInSuperBubbles(ArrayList<Path> interBubblePaths, int startIndex, ArrayList<Path> resultPaths, String hlagenename, int superbubbleNumber){
	int tmpStartIndex = startIndex;
	//for each path in this superbubble
	for(int i=0; i<this.paths.size(); i++){
	    Path p = this.paths.get(i);
	    int pathnum = i;
	    Path curPath = new Path();
	    curPath.setProbability(p.getProbability());
	    curPath.setWeightedIntersectionSum(p.getWeightedIntersectionSum());
	    curPath.setMergedNums(p.getMergedNums());
	    curPath.setMergedTpOpIndicies(p.getMergedTpOpIndicies());
	    curPath.setInterBubbleIntersectionCounts(p.getInterBubbleIntersectionCounts());
	    curPath.setInterBubbleIntersectionCumulativeCounts(p.getInterBubbleIntersectionCumulativeCounts());
	    tmpStartIndex = startIndex;
	    /*if(superbubbleNumber == 0 || this.firstBubble){
		curPath.appendAllEdges(interBubblePaths.get(tmpStartIndex));
		tmpStartIndex++;
	    }*/
	    //for each bubble merged in this path
	    int k=0;//k keeps track of index for orderedEdgeList in Path
	    for(int j=0; j<this.bubbleLengths.size(); j++){
		int bubbleLength = this.bubbleLengths.get(j);
		int limit = k+bubbleLength;
		// copy this bubble's stretch of edges, then cap with the inter-bubble path
		for(;k<limit;k++){
		    curPath.appendEdge(p.getNthEdge(k));
		}
		if(tmpStartIndex < interBubblePaths.size()){
		    curPath.appendAllEdges(interBubblePaths.get(tmpStartIndex));
		}
		tmpStartIndex++;
	    }
	    resultPaths.add(curPath);
	}
	return tmpStartIndex;
    }

    // AllelePath-producing variant: converts TmpPaths to Paths over graph g, marks
    // fracture boundaries at first bubbles, and sets each result's sequence string.
    // Returns the next startIndex into interBubblePaths.
    public int mergePathsInSuperBubbles(ArrayList<TmpPath> interBubblePaths, int startIndex, ArrayList<AllelePath> resultPaths , String hlagenename, int superbubbleNumber , SimpleDirectedWeightedGraph<Node, CustomWeightedEdge> g, ArrayList<Bubble> bubbles, int bubbleOffset){
	int tmpStartIndex = startIndex;
	//for each path in this superbubble: each path here is fractured paths
	for(int i=0; i<this.paths.size(); i++){
	    Path p = this.paths.get(i);
	    AllelePath curPath = new AllelePath(p.getProbability() , p.getWeightedIntersectionSum() , p.getMergedNums() //, p.getMergedOpIndicies()
						, p);
	    ArrayList<ArrayList<CustomWeightedEdge>> bubbleWiseOrderedEdgeLists = p.getBubbleWiseOrderedEdgeList(this.bubbleLengths);
	    //each bubbleWiseOrderedEdgeList is padded by interBubblePaths
	    tmpStartIndex = startIndex;
	    for(int j=0; j<bubbleWiseOrderedEdgeLists.size(); j++){
		// in case of first Bubbles, we need to append interBubble Paths
		if(bubbles.get(j+bubbleOffset).isFirstBubble()){
		    //if(bubbleOffset > 0){//we just mark where disconnectiong in super bubble due to exonic boundaries in AllelePath
		    curPath.setFractureEndIndex();
		    //}
		    if(interBubblePaths.get(tmpStartIndex).numEdges() > 0){
			//interBubblePaths.get(tmpStartIndex).toPath(g);
			Path tmpP = interBubblePaths.get(tmpStartIndex).toPath(g);
			curPath.appendAllEdges(tmpP);//interBubblePaths.get(tmpStartIndex).toPath(g));
		    }
		    tmpStartIndex++;
		}
		//then we add bubble edges
		curPath.appendAllEdges(bubbleWiseOrderedEdgeLists.get(j));
		//and then we cap with InterBubble paths
		if(interBubblePaths.get(tmpStartIndex).numEdges() > 0)
		    curPath.appendAllEdges(interBubblePaths.get(tmpStartIndex).toPath(g));
		tmpStartIndex++;
	    }
	    curPath.setSequenceString(g, superbubbleNumber, i);
	    resultPaths.add(curPath);
	}
	return tmpStartIndex;
    }
    // Degenerate case with no superbubbles: when exactly one non-empty inter-bubble
    // path exists, wraps it into a single AllelePath. Always returns 1.
    public int mergePathsZeroSuperBubbles(ArrayList<TmpPath> interBubblePaths, int startIndex , ArrayList<AllelePath> resultPaths, String hlagenename , int superbubbleNumber , SimpleDirectedWeightedGraph<Node, CustomWeightedEdge> g , ArrayList<Bubble> bubbles, int bubbleOffset, boolean zero){
	AllelePath curPath = new AllelePath();
	HLA.log.appendln("[mergePathsZeroSuperBubbles]: InterBubblePaths size: " + interBubblePaths.size());
	if(interBubblePaths.size() == 1 && interBubblePaths.get(0).numEdges() > 0){
	    interBubblePaths.get(0).print();
	    Path tmpP = interBubblePaths.get(0).toPath(g);
	    curPath.appendAllEdges(tmpP);
	    curPath.setFractureEndIndexForNoSB();
	    curPath.setSequenceString(g, -1, 0); // -1 marks "no superbubble"
	    resultPaths.add(curPath);
	}
	return 1;
    }

    public Bubble(HLAGraph hg){
	this(hg, null, null, null, null);
    }

    public Bubble(HLAGraph hg, Node s, Node t){
	this(hg, s, t, null, null);
    }

    public Bubble(HLAGraph hg, Node s, Node t, Node[] headerNodes, Node[] tailNodes){
	this(hg, s, t, false, headerNodes, tailNodes);
    }

    // Main constructor: records the s/t nodes and their column indices, then (when both
    // ends are present) decomposes the bubble into all s-t paths and prunes unsupported ones.
    public Bubble(HLAGraph hg, Node s, Node t, boolean fb, Node[] headerNodes, Node[] tailNodes){
	this.firstBubble = fb;
	this.g = hg;
	this.sNodes = new ArrayList<Node>();
	this.tNodes = new ArrayList<Node>();
	if(s!=null && t!=null){
	    this.sNodes.add(s);
	    this.tNodes.add(t);
	}
	this.start = new ArrayList<Integer>();
	this.end = new ArrayList<Integer>();
	if(s == null){
	    HLA.log.appendln("[BUBBLE] start node null");
	}
	if(s!=null && t!=null){
	    this.start.add(new Integer(s.getColIndex()));
	    this.end.add(new Integer(t.getColIndex()));
	}
	this.paths = new ArrayList<Path>();
	this.bubbleScores = new ArrayList<BubblePathLikelihoodScores>();
	if(s != null && t != null){
	    this.decompose(s, t, headerNodes, tailNodes);
	    this.removeUnsupported(hg.getGraph(), hg, s, t);
	}
    }

    // headerExcessLen/tailExcessLen are accepted but ignored here — delegates to the
    // main constructor unchanged.
    public Bubble(HLAGraph hg, Node s, Node t, boolean fb, int headerExcessLen, int tailExcessLen, Node[] headerNodes, Node[] tailNodes){
	this(hg, s, t, fb, headerNodes, tailNodes);
    }

    // Replaces the single s-node and its column index (no-op for merged bubbles).
    private void updateStart(Node newS){
	if(this.sNodes.size() == 1){
	    this.sNodes.set(0, newS);
	    this.start.set(0, newS.getColIndex());
	}
    }

    // Replaces the single t-node and its column index (no-op for merged bubbles).
    private void updateEnd(Node newE){
	if(this.tNodes.size() == 1){
	    this.tNodes.set(0, newE);
	    this.end.set(0, newE.getColIndex());
	}
    }

    //find all ST path in the bubble
    //and remove unsupported paths
    // headerNodes/tailNodes, when given, widen the search to multiple start/end nodes;
    // small bubbles (<10 columns) use exhaustive search, larger ones use the pruning search.
    public ArrayList<Path> decompose(Node s, Node t, Node[] headerNodes, Node[] tailNodes){
	int curBubbleSize = t.getColIndex() - s.getColIndex() + 1;
	//if(curBubbleSize < 20)
	if(HLA.DEBUG){
	    if(headerNodes == null && tailNodes == null){
		HLA.log.append("Bubble decomposing...[bubbleSize:" + (t.getColIndex() - s.getColIndex() + 1) +"]\t");
	    }
	    if(headerNodes != null){
		for(Node n: headerNodes)
		    HLA.log.appendln("HN:\t" + n.toString());
	    }
	    if(tailNodes != null){
		for(Node n : tailNodes)
		    HLA.log.appendln("TN:\t" + n.toString());
	    }
	}
	if(headerNodes == null && tailNodes == null){
	    // simple s-t bubble
	    if(curBubbleSize < 10)
		this.paths = this.g.findAllSTPath(s, t);
	    else
		this.paths = this.g.findAllSTPathPruning(s, t);
	}else if(headerNodes == null && tailNodes !=null){
	    // single start, multiple ends
	    curBubbleSize = tailNodes[0].getColIndex() - s.getColIndex() + 1;
	    if(HLA.DEBUG){
		HLA.log.append("[T]Bubble decomposing...[bubbleSize:" + curBubbleSize +"]\t");
		HLA.log.appendln(s.toString() + ":");
	    }
	    for(Node tn : tailNodes){
		if(HLA.DEBUG)
		    HLA.log.appendln("\t" + tn.toString());
		if(curBubbleSize < 10)
		    this.paths.addAll(this.g.findAllSTPath(s, tn));
		else
		    this.paths.addAll(this.g.findAllSTPathPruning(s, tn));
	    }
	    updateEnd(tailNodes[0]);
	}else if(headerNodes !=null && tailNodes == null){
	    // multiple starts, single end
	    curBubbleSize = t.getColIndex() - headerNodes[0].getColIndex() + 1;
	    if(HLA.DEBUG)
		HLA.log.append("[H]Bubble decomposing...[bubbleSize:" + curBubbleSize +"]\t");
	    for(Node sn : headerNodes){
		if(curBubbleSize < 10)
		    this.paths.addAll(this.g.findAllSTPath(sn, t));
		else
		    this.paths.addAll(this.g.findAllSTPathPruning(sn, t));
	    }
	    updateStart(headerNodes[0]);
	}else{
	    // multiple starts and multiple ends
	    curBubbleSize = tailNodes[0].getColIndex() - headerNodes[0].getColIndex() + 1;
	    if(HLA.DEBUG)
		HLA.log.append("[HT]Bubble decomposing...[bubbleSize:" + curBubbleSize +"]\t");
	    for(Node sn : headerNodes){
		for(Node tn : tailNodes){
		    if(curBubbleSize < 10)
			this.paths.addAll(this.g.findAllSTPath(sn, tn));
		    else
			this.paths.addAll(this.g.findAllSTPathPruning(sn, tn));
		}
	    }
	}
	if(HLA.DEBUG)
	    HLA.log.append("Found (" + this.paths.size() + ") possible paths.\n");
	//resets activePathCoutners in edges
	this.initPathCounters();
	for(Path p : this.paths){
	    //p.printNumActivePaths();
	    p.includePath();
	}
	for(Path p : this.paths){
	    p.computeReadSet(g);
	}
	//added to keep track of bubble length
	if(this.paths.size() > 0){
	    this.bubbleLengths = new ArrayList<Integer>();
	    this.bubbleLengths.add(new Integer(this.paths.get(0).getPathLength()));
	}
	return this.paths;
    }

    public ArrayList<BubblePathLikelihoodScores> getBubbleScores(){
	return this.bubbleScores;
    }

    public ArrayList<Integer> getBubbleLengths(){
	return this.bubbleLengths;
    }

    // Resets the active-path counters on the edges of every path.
    public void initPathCounters(){
	for(Path p : this.paths){
	    p.initPathCounter();
	}
    }

    public void printPaths(){
	int count = 0;
	for(Path p : this.paths){
	    HLA.log.append("\tP" + count + "\t");
	    p.printPath();
	    count++;
	}
    }

    // Removes candidate paths that are unsupported by phasing reads or judged erroneous
    // by read-fraction heuristics and pairwise likelihood scoring; stores the resulting
    // BubblePathLikelihoodScores in bubbleScores. Returns the number of removed paths.
    public int removeUnsupported(SimpleDirectedWeightedGraph<Node, CustomWeightedEdge> g, HLAGraph hg, Node s, Node t){
	if(HLA.DEBUG)
	    HLA.log.appendln("[Bubble] unsupported path removal...");
	IntArrayList removalList = new IntArrayList();
	/* removal of possibly erroneous path */
	/* check for really low weight path compared to other paths in the bubble */
	/* currently using 20% cutoff but we should move to probability model */
	int sumOfReadSetSizeOfSupportedPath = 0;
	int[] readsetSizes = new int[this.paths.size()];
	BubblePathLikelihoodScores scores = null;
	int bubbleSize = t.getColIndex() - s.getColIndex() + 1;
	for(int i=0; i<this.paths.size(); i++){
	    Path p = this.paths.get(i);
	    int numEdges = p.getOrderedEdgeList().size();
	    if(!p.isSupportedPath()){
		readsetSizes[i]=0;
		removalList.add(i);//new Integer(i));
		if(HLA.DEBUG3){
		    HLA.log.append("Removing\tPath" + i + "\t");
		    p.printPath();
		}
	    }else{
		readsetSizes[i] =
		    p.getReadSetSize();
		sumOfReadSetSizeOfSupportedPath += readsetSizes[i];
	    }
	}
	Collections.sort(removalList);
	/* FIRST we REMOVE paths with no phasing reads */
	//removalList is sorted.
	//so we remove from the end so we dont need to worry about index change
	for(int i=removalList.size() - 1; i >= 0; i--){
	    this.paths.get(removalList.getInt(i)).excludePath();
	    this.paths.remove(removalList.getInt(i));
	}
	if(HLA.DEBUG)
	    HLA.log.appendln("Removed (" + removalList.size() + ") paths and\t(" + this.paths.size() + ") left.");
	/* end of phasing reads removal */
	removalList = new IntArrayList();
	/* update the readsetSize by copying reassetsize value larger than 0 in order */
	int[] readsetSizeTmp = new int[this.paths.size()];
	int index = 0;
	for(int rs:readsetSizes){
	    if(rs>0){
		readsetSizeTmp[index] = rs;
		index++;
	    }
	}
	readsetSizes = readsetSizeTmp;
	if(HLA.DEBUG){
	    if(this.isFirstBubble()){
		HLA.log.appendln(">>>>>FIRSTBUBBLE<<<<<<");
	    }
	}
	if(HLA.READ_LENGTH < 200){
	    /* First we use a simple heuristic based (parameters) removal */
	    /* Focusing on removing small-weights */
	    // thresholds scale with readset size: tiny readsets need a larger fraction to survive
	    for(int i=0; i<readsetSizes.length; i++){
		if(readsetSizes[i] > 0){
		    double ratio = (1.0d * readsetSizes[i]) / ((double) sumOfReadSetSizeOfSupportedPath);
		    if( (readsetSizes[i] <= 1 && ratio < 0.2)
			|| (readsetSizes[i] <= 2 && ratio < 0.134)
			|| (readsetSizes[i] <= 4 && ratio < 0.1)
			|| (readsetSizes[i] > 4 && ratio < 0.05) ){
			//if(ratio < 0.2){
			removalList.add(i);//new Integer(i));
			if(HLA.DEBUG3){
			    HLA.log.append("[Possibly errorneous path] Removing\tPath" + i + "(|RS|=" +readsetSizes[i] + ",ratio="+ ratio + ",sumReadsetSizes="+ sumOfReadSetSizeOfSupportedPath + ")\t");
			    this.paths.get(i).printPath();
			}
		    }
		}
	    }
	}else{
	    // long-read mode: only prune in first bubbles, and only when a low-fraction
	    // path's reads are largely shared with a dominant (>0.7 fraction) path
	    if(this.isFirstBubble()){
		for(int i=0; i<readsetSizes.length; i++){
		    if(readsetSizes[i] > 0){
			if(HLA.DEBUG3)
			    HLA.log.appendln("===== CHECKING PATH[" + i + "]");
			double ratio = (1.0d * readsetSizes[i]) / ((double) sumOfReadSetSizeOfSupportedPath);
			if(ratio < 0.2){
			    if(HLA.DEBUG3)
				HLA.log.appendln("===== ratio:\t" + ratio + " =======");
			    CustomHashMap tMap = this.paths.get(i).getReadSet();
			    for(int j=0;j<readsetSizes.length;j++){
				if(i!=j){
				    double oratio = (1.0d * readsetSizes[j]) / ((double) sumOfReadSetSizeOfSupportedPath);
				    if(oratio > 0.7){
					if(HLA.DEBUG3)
					    HLA.log.appendln("oRatio:" + oratio + "\t>\t0.7");
					CustomHashMap oMap = this.paths.get(j).getReadSet();
					IntIterator itr = tMap.keySet().iterator();
					int rmCount=0;
					// count reads of path i also found (either orientation) in dominant path j
					while(itr.hasNext()){
					    int ci = itr.nextInt();
					    if(oMap.containsKey(ci) || oMap.containsKey(0-ci)){
						if(HLA.DEBUG3)
						    HLA.log.appendln("(" + ci +") in dominant path[" + j+"]" );
						rmCount++;
					    }
					}
					if(rmCount > 0){
					    double updatedRatio = ((readsetSizes[i]-rmCount)*1.0d / ((double) sumOfReadSetSizeOfSupportedPath));
					    if( updatedRatio <= 0.1d ){
						if(HLA.DEBUG3)
						    HLA.log.appendln("Removing due to low updatedRatio:\t" + updatedRatio );
						removalList.add(i);
					    }
					}
				    }
				}
			    }
			}
		    }
		}
	    }
	}
	/* Liklihood calculation retain function --> geared towards retaining the best */
	if(this.paths.size() >= 1){
	    if(HLA.DEBUG3)
		HLA.log.appendln("RUNNING MaxLikelihoodCalcFor BubblePaths");
	    //double[] scoreAndIndices = this.takeMaximumLikeliPair(g);/* [Homo 0-2, Hetero 3-5] 0:homoScore 1:homoIndex1 2:homoIndex2 3:heteroScore 4:heteroIndex1 5: heteroIndex2*/
	    scores = this.takeMaximumLikeliPair(g);
	    double homoScore = scores.getMaxHomoScore();//scoreAndIndices[0];
	    int homoIndex1 = scores.getMaxHomoGenotypeIndex();//(int) scoreAndIndices[1];
	    int homoIndex2 = homoIndex1;//(int) scoreAndIndices[2];
	    double heteroScore = scores.getMaxHeteroScore();//scoreAndIndices[3];
	    int heteroIndex1 = scores.getMaxHeteroGenotypeIndex1();//(int) scoreAndIndices[4];
	    int heteroIndex2 = scores.getMaxHeteroGenotypeIndex2();//(int) scoreAndIndices[5];
	    int doubleCount1 = scores.getDoubleCountH1();//(int) scoreAndIndices[6];
	    int doubleCount2 = scores.getDoubleCountH2();//(int) scoreAndIndices[7];
	    if(HLA.DEBUG3){
		HLA.log.appendln("2x|H1| = " + doubleCount1);
		HLA.log.appendln("2x|H2| = " + doubleCount2);
	    }
	    boolean isAlleleWeak1 = false;
	    boolean isAlleleWeak2 = false;
	    // flag the minority haplotype as weak when its read share is too small for its count
	    int minCount = doubleCount1;
	    int maxCount = doubleCount2;
	    if(doubleCount1 > doubleCount2){
		minCount = doubleCount2;
		maxCount = doubleCount1;
	    }
	    double frequency = (minCount*1.0d)/ ((minCount+maxCount)*1.0d);
	    if( (minCount <= 2 && frequency < 0.2)
		|| (minCount <= 4 && frequency < 0.15)
		|| (minCount <= 6 && frequency < 0.14)
		|| (minCount <= 8 && frequency < 0.1) ){ // (minCount > 8 && frequency < 0.05) ){
		if(doubleCount1 > doubleCount2)
		    isAlleleWeak2 = true;
		else
		    isAlleleWeak1 = true;
	    }
	    //double[][] genotypeScores; //i: H1 path index, j: H2 path index
	    // when many paths remain, let an obvious read-count-dominant pair override
	    // the likelihood-chosen heterozygous pair
	    if(this.paths.size() > 3){
		int[] obHeteroPair = this.getObviousHeteroPair();
		if(obHeteroPair != null){
		    if(obHeteroPair[0] != heteroIndex1 || obHeteroPair[1] != heteroIndex2){
			if(HLA.DEBUG3){
			    HLA.log.appendln("Swapping best heterozygous genotype from [" + heteroIndex1 + ":" + heteroIndex2 + "] --> [" + obHeteroPair[0] + ":" + obHeteroPair[1] + "]" );
			}
			heteroIndex1 = obHeteroPair[0];
			heteroIndex2 = obHeteroPair[1];
			heteroScore = scores.getLogScore(heteroIndex1, heteroIndex2);
			scores.setHeteroIndicies(obHeteroPair[0], obHeteroPair[1]);
			isAlleleWeak1 = false;
			isAlleleWeak2 = false;
		    }
		}
	    }
	    if(HLA.DEBUG3){
		HLA.log.appendln("Best homozygous genotype is :\t" + homoIndex1 + "/" + homoIndex2 + "\tscore:" + homoScore);
		HLA.log.appendln("Best heterozygous genotype is :\t" + heteroIndex1 + "/" + heteroIndex2 + "\tscore:" + heteroScore);
		if(isAlleleWeak1)
		    HLA.log.appendln("H1 count is low : minCount/maxCount" + (minCount/2.0d) + "/" + ((minCount+maxCount)/2.0d));
		else if(isAlleleWeak2)
		    HLA.log.appendln("H2 count is low : minCount/maxCount" + (minCount/2.0d) + "/" + ((minCount+maxCount)/2.0d));
	    }
	    if(HLA.READ_LENGTH >= 200){
		/* if(isAlleleWeak1)
		   removalList.add(heteroIndex1);
		   else if(isAlleleWeak2)
		   removalList.add(heteroIndex2);
		   for(int i=0; i<readsetSizes.length;i++){
		   if( (i != heteroIndex1) && (i != heteroIndex2) ){
		   removalList.add(i);
		   HLA.log.appendln("[Possibly erroneous path] Removing\tPath" + i + "\t");
		   this.paths.get(i).printPath();
		   }
		   }*/
		double diff = homoScore - heteroScore;
		if(hg.isClassI() && diff > 3){
		    // homozygous call: keep only the best homozygous path
		    for(int i = 0; i<readsetSizes.length;i++){
			if( i != homoIndex1){
			    removalList.add(i);
			    if(HLA.DEBUG3){
				HLA.log.appendln("[Homo>200][Possibly erroneous path] Removing\tPath" + i + "\t");
				this.paths.get(i).printPath();
			    }
			}
		    }
		}else{
		    // heterozygous call: drop a weak allele, then everything outside the pair
		    if(isAlleleWeak1){
			removalList.add(heteroIndex1);
			if(HLA.DEBUG3){
			    HLA.log.appendln("[Hetero>200Weak1][Possibly erroneous path] Removing\tPath" + heteroIndex1 + "\t");
			    this.paths.get(heteroIndex1).printPath();
			}
		    }else if(isAlleleWeak2){
			removalList.add(heteroIndex2);
			if(HLA.DEBUG3){
			    HLA.log.appendln("[Hetero>200Weak2][Possibly erroneous path] Removing\tPath" + heteroIndex2 + "\t");
			    this.paths.get(heteroIndex2).printPath();
			}
		    }
		    for(int i=0; i<readsetSizes.length;i++){
			if( (i != heteroIndex1) && (i != heteroIndex2) ){
			    removalList.add(i);
			    if(HLA.DEBUG3){
				HLA.log.appendln("[Hetero>200][Possibly erroneous path] Removing\tPath" + i + "\t");
				this.paths.get(i).printPath();
			    }
			}
		    }
		}
	    }else{
		double diff = homoScore - heteroScore;
		if(heteroIndex1 != heteroIndex2){
		    double hetero1Fraction = (readsetSizes[heteroIndex1]) / ((double) sumOfReadSetSizeOfSupportedPath);
		    double hetero2Fraction = (readsetSizes[heteroIndex2]) / ((double) sumOfReadSetSizeOfSupportedPath);
		    int hetIndex1 = heteroIndex1;
		    int hetIndex2 = heteroIndex2;
		    // order so hetero1Fraction is the larger fraction
		    if(hetero1Fraction < hetero2Fraction){
			double tmp = hetero1Fraction;
			hetero1Fraction = hetero2Fraction;
			hetero2Fraction = tmp;
			int tmpI = hetIndex1;
			hetIndex1 = hetIndex2;
			hetIndex2 = tmpI;
		    }
		    if(HLA.DEBUG3){
			HLA.log.appendln("hetero1Fraction:" + hetero1Fraction +"\thetero2Fraction:" + hetero2Fraction);
			HLA.log.appendln("hetIndex1:" + hetIndex1 + "\thetIndex2:" + hetIndex2);
		    }
		    // if the pair dominates the reads, drop other low-fraction paths
		    if( (hetero1Fraction + hetero2Fraction) >= 0.8 && hetero2Fraction > 0.3){
			for(int i=0; i<readsetSizes.length; i++){
			    if(readsetSizes[i] > 0 && i != hetIndex1 && i != hetIndex2){
				if( (readsetSizes[i] / ((double) sumOfReadSetSizeOfSupportedPath)) < 0.2){
				    removalList.add(i);
				    if(HLA.DEBUG3){
					HLA.log.append("[Possibly errorneous path] Removing\tPath" + i);
					this.paths.get(i).printPath();
				    }
				}
			    }
			}
		    }
		}
		if(HLA.DEBUG3)
		    HLA.log.appendln("diff=homoScore - heteroScore : " + diff);
		if(diff > 0 && diff > 5.3d){ // homoScore > heteroScore
		    for(int i=0; i<readsetSizes.length;i++){
			if(i != homoIndex1){
			    removalList.add(i);
			    if(HLA.DEBUG3){
				HLA.log.appendln("[Possibly erroneous path] Removing\tPath" + i + "\t");
				this.paths.get(i).printPath();
			    }
			}
		    }
		}else{
		    // heterozygous wins: rescue the chosen pair from earlier heuristic removal
		    for(int i=0;i<removalList.size();i++){
			if(removalList.getInt(i) == heteroIndex1 || removalList.getInt(i) == heteroIndex2){
			    removalList.removeInt(i);
			    if(HLA.DEBUG3)
				HLA.log.appendln("[Retaing path due to liklihood calculation] Rescueing\tPath" + i + "\t");
			}
		    }
		}
	    }
	}else{
	    if(HLA.DEBUG3)
		HLA.log.appendln("NOT Running likelihood calc because there are less than 2 paths remaining.");
	}
	Collections.sort(removalList);
	// de-duplicate the sorted removal list in place
	int pre = -1;
	for(int i=0; i<removalList.size(); i++){
	    if(removalList.getInt(i) == pre){
		removalList.removeInt(i);
		i--;
	    }else
		pre = removalList.getInt(i);
	}
	//removalList is sorted.
	//so we remove from the end so we dont need to worry about index change
	for(int i=removalList.size() - 1; i >= 0; i--){
	    this.paths.get(removalList.getInt(i)).excludePath();
	    this.paths.remove(removalList.getInt(i));
	}
	if(HLA.DEBUG)
	    HLA.log.appendln("Removed (" + removalList.size() + ") paths and\t(" + this.paths.size() + ") left.");
	try{
	    scores.applyRemoval(removalList);
	}catch(Exception e){
	    e.printStackTrace();
	    HLA.log.outToFile();
	    System.exit(-9);
	}
	this.bubbleScores.add(scores);
	return removalList.size();
    }

    // Builds, for each candidate path, the per-read base/error-probability matrix used
    // by the likelihood calculation.
    public PathBaseErrorProb[] getDataMatrixForLikelihoodCalculation(SimpleDirectedWeightedGraph<Node, CustomWeightedEdge> g){
	PathBaseErrorProb[] pathWisePathBaseErrorProbMatrices = new PathBaseErrorProb[this.paths.size()];
	for(int i=0; i<this.paths.size(); i++)
	    pathWisePathBaseErrorProbMatrices[i] = this.paths.get(i).getBaseErrorProbMatrix(g);
	return pathWisePathBaseErrorProbMatrices;
    }

    // Returns the index pair {smaller, larger} of the two read-count-dominant paths when
    // their combined read fraction clearly indicates a heterozygous pair; null otherwise.
    // Requires >2 paths and a total read count > 30.
    public int[] getObviousHeteroPair(){
	if(this.paths.size() > 2){
	    int max_i = 0;
	    int maxSize = this.paths.get(0).getReadSetSize();
	    int secondMax_i = 1;
	    int secondMaxSize = this.paths.get(1).getReadSetSize();
	    int total = maxSize + secondMaxSize;
	    if(secondMaxSize > maxSize){
		max_i = 1;
		secondMax_i = 0;
		int tmp = secondMaxSize;
		secondMaxSize = maxSize;
		maxSize = tmp;
	    }
	    // single pass keeps the top two readset sizes and the running total
	    for(int i=2;i<this.paths.size();i++){
		int curSize = this.paths.get(i).getReadSetSize();
		total += curSize;
		if(curSize > maxSize){
		    secondMax_i = max_i;
		    secondMaxSize = maxSize;
		    max_i = i;
		    maxSize = curSize;
		}else if(curSize > this.paths.get(secondMax_i).getReadSetSize()){
		    secondMax_i = i;
		    secondMaxSize = curSize;
		}
	    }
	    if(total > 30){
		double bestFrac = ( (maxSize + secondMaxSize)*1.0d ) / (total*1.0d);
		double secondFrac = (secondMaxSize*1.0d) / (total*1.0d);
		if(bestFrac > 0.75 && secondFrac > 0.3){
		    int[] result = new int[2];
		    if(max_i < secondMax_i){
			result[0] = max_i;
			result[1] = secondMax_i;
		    }else{
			result[0] = secondMax_i;
			result[1] = max_i;
		    }
		    return result;
		}
	    }
	}
	return null;
    }

    //Given all possible paths
    //obtain the pair that gives maxium liklihood of observing data.
    //ONLY considers paths that have phasing-read
    // Scores every unordered path pair (including homozygous i==j) against the reads of
    // all paths and records the maxima in a BubblePathLikelihoodScores.
    public BubblePathLikelihoodScores takeMaximumLikeliPair(SimpleDirectedWeightedGraph<Node, CustomWeightedEdge> g){
	double readFractionScore = 0.0d;
	int readSum = 0;
	double curScore = 0.0d;
	//obtain path-wise PathBaseErrorProbMaxtrix
	PathBaseErrorProb[] pathWiseErrorProbMatrices = this.getDataMatrixForLikelihoodCalculation(g);
	for(int i=0; i< pathWiseErrorProbMatrices.length; i++){
	    if(HLA.DEBUG3)
		HLA.log.appendln("path[" + i + "]:\t");
	    char[] pathBases = pathWiseErrorProbMatrices[i].getBases();
	    readSum += pathWiseErrorProbMatrices[i].numReads();
	    if(HLA.DEBUG3){
		for(char c : pathBases){
		    HLA.log.append(c);
		}
		HLA.log.appendln();
	    }
	}
	BubblePathLikelihoodScores scores = new BubblePathLikelihoodScores(this.paths.size());
	//getting all possible pairs, including self
	for(int i=0;i<this.paths.size();i++){
	    for(int j=i;j<this.paths.size();j++){
		//NEED TO CALL getScoreForSingleRead over All Reads and sumup the logScore.
		char[] pathBases1 = pathWiseErrorProbMatrices[i].getBases();
		char[] pathBases2 = pathWiseErrorProbMatrices[j].getBases();
		//iterate over all reads
		readFractionScore = 0.0d;
		curScore = 0.0d;
		Val whichH = new Val();
		int doubleCountH1 = 0;
		int doubleCountH2 = 0;
		double tFraction = (pathWiseErrorProbMatrices[i].numReads() * 1.0d) / (readSum * 1.0d);
		double oFraction = (pathWiseErrorProbMatrices[j].numReads() * 1.0d) / (readSum * 1.0d);
		if(i==j){//homozygous
		    tFraction = tFraction / 2.0d;
		    oFraction = oFraction / 2.0d;
		    readFractionScore += Math.log(tFraction*oFraction);
		}else//heterozygous
		    readFractionScore += Math.log(tFraction*oFraction/HLA.X_FACTOR);
		// sum the log-likelihood of every read of every path under the (i,j) pair
		for(int k=0; k<this.paths.size();k++){
		    //if( (i== 1 && j == 8) || (i==0 && j == 1) )
		    PathBaseErrorProb curPathErrorMatrix = pathWiseErrorProbMatrices[k];
		    for(int l=0; l<curPathErrorMatrix.numReads(); l++){
			boolean debug = false;
			double readScore = this.getScoreForSingleRead( curPathErrorMatrix.getNthReadErrorProb(l)
								       , curPathErrorMatrix.getBases()
								       , pathBases1
								       , pathBases2
								       , whichH, debug);// ,i, j);
			/* double readScore = this.getScoreForSingleReadMax( curPathErrorMatrix.getNthReadErrorProb(l)
			   , curPathErrorMatrix.getBases()
			   , pathBases1
			   , pathBases2
			   , whichH );// ,i, j); */
			curScore += readScore;
			//if( (i== 0 && j == 1) || (i==0 && j == 2) ){
			//HLA.log.appendln("\tP(Di|G) = " + readScore);
			// }
			// tally haplotype assignment: a clear winner gets 2, ties give 1 each
			if(whichH.getWhichH() == 0)
			    doubleCountH1 += 2;
			else if(whichH.getWhichH() == 1)
			    doubleCountH2 += 2;
			else{
			    doubleCountH1++;
			    doubleCountH2++;
			}
		    }
		}
		//HLA.log.appendln("logP( D | Haplotype[ " + i + ":" + j + " ] ) =\t" + curScore);
		if(HLA.DEBUG3)
		    HLA.log.appendln("logP( D | Haplotype[ " + i + ":" + j + " ] ) =\t" + curScore + "\t|H1|x2=" + doubleCountH1 + "\t|H2|x2=" + doubleCountH2);
		scores.updateMax(i, j, curScore, readFractionScore, doubleCountH1, doubleCountH2);
	    }
	}
	return scores;
    }

    //readPhredScores --> contains ordered phredScore of readBases
    //readBases --> contains ordered readBases {A,C,G,T}
    //pathBases1 --> contains ordered bases {A,C,G,T} of the first allele
    //pathBases2 --> contains ordered bases {A,C,G,T} of the second allele
    // Returns log( (P(read|H1) + P(read|H2)) / 2 ) and records in whichH which haplotype
    // fits the read better (0: H1, 1: H2, -1: tie).
    private double getScoreForSingleRead(double[] errorProbs, char[] readBases, char[] pathBases1, char[] pathBases2, Val whichH, boolean debug){//, int path_i, int path_j){ //, columnTransition){
	if(debug){
	    HLA.log.append("Testing:\t" );
	    for(char c : readBases)
		HLA.log.append(c);
	    HLA.log.append("\nAgainst H1:\t");
	    for(char c : pathBases1)
		HLA.log.append(c);
	    HLA.log.append("\tH2:\t");
	    for(char c : pathBases2)
		HLA.log.append(c);
	    HLA.log.appendln();
	}
	double logProb1 = 0.0d;
	double logProb2 = 0.0d;
	double gapgapmatch = 0.99d;
	double gappenalty = 0.01d;
	//for each position in read
	try{
	    for(int i=0; i<readBases.length; i++){
		double errorProb = errorProbs[i];//Math.Pow(0.1, readPhredScores[i]/10.0d);
		double matchProb = 1.0d - errorProb;
		//HLA.log.appendln("read(" + i + "):\terroProb:" + errorProb + "\tmatchProb:" + matchProb );
		double mismatchProb = errorProb / 3.0d; // error spread over the 3 other bases
		char readBase = readBases[i];
		char pathBase1 = pathBases1[i];
		char pathBase2 = pathBases2[i];
		if(debug){
		    HLA.log.appendln("RB:" + readBase + "\tPB1:" + pathBases1[i] + "\tPB2:" + pathBases2[i]);
		    HLA.log.appendln("MatchP:\t" + matchProb +"\tMismatchP:\t" + mismatchProb);
		}
		// H1: N is uninformative (uniform 1/4); gaps use fixed match/penalty probs
		if(readBase == 'N' || pathBase1 == 'N')
		    logProb1 += Math.log(0.25d);//matchProb/4.0d);
		else if(readBase == '-' || pathBase1 == '-'){
		    if(readBase == pathBase1)
			logProb1 += Math.log(gapgapmatch);
		    else
			logProb1 += Math.log(gappenalty);
		}else if(readBase == pathBase1){
		    if(debug)
			HLA.log.appendln("MatchPB1");
		    logProb1 += Math.log(matchProb);
		}else{
		    if(debug)
			HLA.log.appendln("MismatchPB1");
		    logProb1 += Math.log(mismatchProb);
		}
		// H2: same scoring against the second allele
		if(readBase == 'N' || pathBase2 == 'N')
		    logProb2 += Math.log(0.25d);//matchProb/4.0d);
		else if(readBase == '-' || pathBase2 == '-'){
		    if(readBase == pathBase2)
			logProb2 += Math.log(gapgapmatch);
		    else
			logProb2 += Math.log(gappenalty);
		}else if(readBase == pathBase2){
		    if(debug)
			HLA.log.appendln("MatchPB2");
		    logProb2 += Math.log(matchProb);
		}else{
		    if(debug)
			HLA.log.appendln("MismatchPB2");
		    logProb2 += Math.log(mismatchProb);
		}
		//HLA.log.appendln("logProb1: " + logProb1 + "\tlogProb2: " + logProb2);
		//logProb += Math.log(avgProb);
		//logProb += columnTransition --> need to add transitionProb
	    }
	}catch(ArrayIndexOutOfBoundsException e){
	    HLA.log.appendln("|eProbs| :" + errorProbs.length);
	    HLA.log.appendln("|rBases| :" + readBases.length);
	    HLA.log.appendln("|pBase1| :" + pathBases1.length);
	    HLA.log.appendln("|pBase2| :" + pathBases2.length);
	    HLA.log.outToFile();
	    e.printStackTrace();
	    System.exit(-1);
	}
	/* putting assignment count */
	if(logProb1 > logProb2)
	    whichH.set(0);
	else if(logProb1 == logProb2)
	    whichH.set(-1);
	else
	    whichH.set(1);
	/* end of assignment count*/
	double maxLog = (logProb1 > logProb2 ? logProb1 : logProb2);
	/* if( (path_i == 0 && path_j ==1) || (path_i ==1) && (path_j==8) )
	   HLA.log.append("logProb1: " + logProb1 + "\tlogProb2: " + logProb2 + "\tsum: " + (logProb1 + logProb2)); */
	//HLA.log.appendln("\tlogProb1: " + logProb1 + "\tlogProb2: " + logProb2);
	// log-sum-exp of the two haplotype likelihoods, averaged (divided by 2)
	return maxLog + Math.log(Math.exp(logProb1-maxLog) + Math.exp(logProb2-maxLog)) - Math.log(2);
	//return logProb;
    }

    //readPhredScores --> contains ordered phredScore of readBases
    //readBases --> contains ordered readBases {A,C,G,T}
    //pathBases1 --> contains ordered bases {A,C,G,T} of the first allele
    //pathBases2 --> contains ordered bases {A,C,G,T} of the second allele
    // Max variant: returns the larger of the two haplotype log-likelihoods instead of
    // their average (currently unused; see the commented call in takeMaximumLikeliPair).
    private double getScoreForSingleReadMax(double[] errorProbs, char[] readBases, char[] pathBases1, char[] pathBases2, Val whichH){ //, columnTransition){
	double logProb1 = 0.0d;
	double logProb2 = 0.0d;
	double gapgapmatch = 0.99d;
	double gappenalty = 0.01d;
	//for each position in read
	try{
	    for(int i=0; i<readBases.length; i++){
		double errorProb = errorProbs[i];//Math.Pow(0.1, readPhredScores[i]/10.0d);
		double matchProb = 1.0d - errorProb;
		double mismatchProb = errorProb / 3.0d;
		char readBase = readBases[i];
		char pathBase1 = pathBases1[i];
		char pathBase2 =
pathBases2[i]; //double avgProb = 0.0d; if(readBase == 'N' || pathBase1 == 'N') logProb1 += Math.log(0.25d);//matchProb/4.0d); else if(readBase == '-' || pathBase1 == '-'){ if(readBase == pathBase1) logProb1 += Math.log(gapgapmatch); else logProb1 += Math.log(gappenalty); }else if(readBase == pathBase1) logProb1 += Math.log(matchProb); else logProb1 += Math.log(mismatchProb); if(readBase == 'N' || pathBase2 == 'N') logProb2 += Math.log(0.25d);//matchProb/4.0d); else if(readBase == '-' || pathBase2 == '-'){ if(readBase == pathBase2) logProb2 += Math.log(gapgapmatch); else logProb2 += Math.log(gappenalty); }else if(readBase == pathBase2) logProb2 += Math.log(matchProb); else logProb2 += Math.log(mismatchProb); //logProb += Math.log(avgProb); //logProb += columnTransition --> need to add transitionProb } }catch(ArrayIndexOutOfBoundsException e){ HLA.log.appendln("|eProbs| :" + errorProbs.length); HLA.log.appendln("|rBases| :" + readBases.length); HLA.log.appendln("|pBase1| :" + pathBases1.length); HLA.log.appendln("|pBase2| :" + pathBases2.length); HLA.log.outToFile(); e.printStackTrace(); System.exit(-1); } HLA.log.appendln("\tlogProb1: " + logProb1 + "\tlogProb2: " + logProb2); if(logProb1 > logProb2){ whichH.set(0); return logProb1; }else if(logProb1 == logProb2){ whichH.set(-1); return logProb1; }else{ whichH.set(1); return logProb2; } } /* same as removeUnsupported except this only cares about unique edges */ public int removeUnsupportedUniqueEdgeOnly(){ //HashSet<Integer> readHash; //ArrayList<Integer> removalList = new ArrayList<Integer>(); CustomHashMap readHash; IntArrayList removalList = new IntArrayList(); for(int i=0; i<this.paths.size(); i++){//Path p: this.paths){ Path p = this.paths.get(i); ArrayList<CustomWeightedEdge> eList = p.getOrderedEdgeList(); boolean inited = false; readHash = null; int numUnique = 0; for(CustomWeightedEdge e : eList){ HLA.log.append("|" + e.getNumActivePath() + "|"); if(e.isUniqueEdge()){ //we only care about uniq edges 
numUnique++; if(!inited){ readHash = e.getReadHashSet().clone();//e.getReadHashSetDeepCopy(); inited = true; }else{ //update with union after checking intersection //if null, we remove this path if(e.unionAfterCheckingIntersection(readHash) == null){ removalList.add(i);//new Integer(i)); break; } } } } HLA.log.append("[" + numUnique + "]"); } for(int i=removalList.size() - 1; i >= 0; i--){ this.paths.get(removalList.getInt(i)).excludePath(); this.paths.remove(removalList.getInt(i)); } return removalList.size(); } //should only be used between two simple bubble //returns int[][] this.paths.sizes() x other.getPaths.size() array. public int[][] getIntersectionCount(Bubble other){ //int intersectionSizesSum = 0; int[][] intersectionCounts = new int[this.paths.size()][other.getPaths().size()]; for(int i=0;i<this.paths.size();i++){ Path tp = this.paths.get(i); for(int j=0; j<other.getPaths().size();j++){ Path op = other.getPaths().get(j); int intersectionSize = tp.isPhasedWith(op); if(intersectionSize >= Path.MIN_SUPPORT_PHASING) intersectionCounts[i][j] = intersectionSize; } } return intersectionCounts; } public MergeStatus mergeBubble(Bubble other, int lastSegregationColumnIndex, boolean isClassII, Bubble lastMergedBubble, SimpleDirectedWeightedGraph<Node, CustomWeightedEdge> g){ int pathLength = this.getEnd().get(this.getEnd().size()-1) - this.getStart().get(0); boolean isOtherFirstInInterval = false; if(other.isFirstBubble()){ isOtherFirstInInterval = true; } MergeStatus ms = new MergeStatus(false, lastSegregationColumnIndex); int distanceToLastSegregation = other.getStart().get(0) - lastSegregationColumnIndex; //public boolean mergeBubble(Bubble other){ ArrayList<Path> paths_new = new ArrayList<Path>(); //boolean[] tpUsed = new boolean[this.paths.size()]; //boolean[] opUsed = new boolean[other.getPaths().size()]; if(HLA.DEBUG) HLA.log.appendln(">>>>>>>>>>>>> getting intersecti <<<<<<<<<<<<"); int[][] interBubbleIntersectionSizes = 
lastMergedBubble.getIntersectionCount(other); int[][] interBubbleIntersectionCumulativeSizes = new int[this.paths.size()][other.getPaths().size()]; /* path used counters */ int[] tpUsed = new int[this.paths.size()]; int[] opUsed = new int[other.getPaths().size()]; if(HLA.DEBUG){ /* print this paths (DEBUGGIN) */ for(int i=0;i<this.paths.size();i++){ Path tp = this.paths.get(i); HLA.log.append("TP(" + i + ")\t<readNum:" + tp.getReadSetSize() + ">\t"); tp.printInfo(); tp.printReadSet(); } /* print other paths (DEBUGGIN) */ for(int i=0;i<other.getPaths().size();i++){ Path op = other.getPaths().get(i); HLA.log.append("OP(" + i + ")\t<readNum:" + op.getReadSetSize() + ">\t"); op.printInfo(); op.printReadSet(); } } ArrayList<int[]> phasedList = new ArrayList<int[]>(); ArrayList<Integer> intersectionSizes = new ArrayList<Integer>(); int intersectionSizesSum = 0; int[] intersectionSizesTPSum = new int[this.paths.size()]; int[] intersectionSizesOPSum = new int[other.getPaths().size()]; /* check possible paths TP X OP */ for(int i=0;i<this.paths.size();i++){ Path tp = this.paths.get(i); for(int j=0; j<other.getPaths().size(); j++){ Path op = other.getPaths().get(j); if(HLA.DEBUG) HLA.log.append("TP(" + i + ")\tX\tOP("+j+"):\t"); int intersectionSize = tp.isPhasedWith(op); interBubbleIntersectionCumulativeSizes[i][j] = intersectionSize; if(intersectionSize >= Path.MIN_SUPPORT_PHASING){ //if(tp.isPhasedWith(op)){ //paths_new.add(tp.mergePaths(op)); intersectionSizesSum += intersectionSize; intersectionSizesTPSum[i] += intersectionSize; intersectionSizesOPSum[j] += intersectionSize; intersectionSizes.add(new Integer(intersectionSize)); int[] tmp = new int[2]; tmp[0] = i; tmp[1] = j; phasedList.add(tmp); tpUsed[i]++; opUsed[j]++; } } //decision to split is done after all TP X OP checking /* //split first, we will use imputation to connect these spots. 
if(tpUsed[i] == 0){ int pathLength = this.getEnd().get(this.getEnd().size()-1) - this.getStart().get(0); HLA.log.appendln("LOSING TP(" + i + "):\tcurLen:"+pathLength + "\td2ls:" + distanceToLastSegregation); if(pathLength >= HLA.READ_LENGTH || distanceToLastSegregation >= (0.5 * HLA.READ_LENGTH)){ ms.setSplit(true); HLA.log.appendln("CANT PHASE FURTHER. SPLITTING..."); return ms; } }*/ } boolean otherSignificantSignal = false; for(Integer i : intersectionSizes){ if(i.intValue() >= Path.MIN_SIGNIFICANT_INTERSECTION_SIZE_WHEN_PRUNING) otherSignificantSignal = true; } /* Check paths that are being lost due to no phasing reads */ for(int i=0; i<this.paths.size(); i++){ //there is no more phasing reads/pairs if(tpUsed[i] == 0){ int lastColumnIndex = this.paths.get(i).getLastUniqueEdgeColumnNumber(g, false); int curColumnIndex = other.getStart().get(0); int pathSpecificLastSegregation = curColumnIndex - lastColumnIndex + 1; //int pathLength = this.getEnd().get(this.getEnd().size()-1) - this.getStart().get(0); if(HLA.DEBUG) HLA.log.appendln("LOSING TP(" + i + "):\tcurLen:"+pathLength + "\td2ls:" + distanceToLastSegregation + "\td2psls:"+ pathSpecificLastSegregation); if(!otherSignificantSignal){ if( (pathLength >= HLA.READ_LENGTH && distanceToLastSegregation >= (0.5 * HLA.READ_LENGTH)) ){ ms.setSplit(true); if(HLA.DEBUG) HLA.log.appendln("[FIRST CHECK]CANT PHASE FURTHER. SPLITTING..."); //return ms; }else if(pathLength >= HLA.READ_LENGTH && pathSpecificLastSegregation >= (0.75 * HLA.READ_LENGTH)){ ms.setSplit(true); if(HLA.DEBUG) HLA.log.appendln("[LONG SEGREGATION] CANT PHASE FURTHER. SPLITTING..."); }else if( distanceToLastSegregation >= (1.5 * HLA.READ_LENGTH) ){ ms.setSplit(true); if(HLA.DEBUG) HLA.log.appendln("[STRONG SIGNAL]CANT PHASE FURTHER. SPLITTING..."); }else if(isClassII && pathLength >=200){ ms.setSplit(true); if(HLA.DEBUG) HLA.log.appendln("[CLASS II LENGTH]CANT PHASE FURTHER. 
SPLITTING..."); }else if( this.paths.size() == 2 && phasedList.size() == 1 && pathLength >= 0.5 * HLA.READ_LENGTH && distanceToLastSegregation >= (0.7 * HLA.READ_LENGTH) ){ ms.setSplit(true); if(HLA.DEBUG) HLA.log.appendln("[NECESSARY] CANT PHASE FURTHER. SPLITTING..."); } }else{ if( distanceToLastSegregation >= (1.5 * HLA.READ_LENGTH) ){ ms.setSplit(true); if(HLA.DEBUG) HLA.log.appendln("[STRONG SIGNAL]CANT PHASE FURTHER. SPLITTING..."); }else if(isClassII && pathLength >=200){ ms.setSplit(true); if(HLA.DEBUG) HLA.log.appendln("[CLASS II LENGTH]CANT PHASE FURTHER. SPLITTING..."); }else if(pathLength >= HLA.READ_LENGTH && pathSpecificLastSegregation >= (0.75 * HLA.READ_LENGTH)){ ms.setSplit(true); if(HLA.DEBUG) HLA.log.appendln("[LONG SEGREGATION] CANT PHASE FURTHER. SPLITTING..."); } } } } if(ms.isSplit()){ if(HLA.DEBUG){ for(int i=0; i< this.paths.size(); i++){ HLA.log.append("TP(" + i + "):\t"); this.paths.get(i).printReadSet(); } } //HLA.log.appendln("CANT PHASE FURTHER. SPLITTING..."); return ms; } /* END OF check for no-phasing paths*/ if(HLA.DEBUG) HLA.log.appendln("TOTAL of " + phasedList.size() + "\tphased paths."); int origSizeSum = intersectionSizesSum; /* for(int j=0; j<opUsed.length; j++){ if(opUsed[j] > 1 ){//&& opUsed[j] < phasedList.size()){ for(int k=0; k<phasedList.size();k++){ int[] ijs = phasedList.get(k); if(ijs[1] == j){ int curSize = intersectionSizes.get(k); double d = (intersectionSizes.get(k) * 1.0d) / (origSizeSum * 1.0d); HLA.log.appendln("Checking branch:\td:" + d + "\tcurSize:" + curSize + "\tTP(" + ijs[0] + ")\tx\tOP(" + ijs[1] + ")"); //if( (curSize <2 && d < 0.1 ) || (curSize < 3 && d<0.06) || (curSize >= 3 && d <0.05) ){ if( (curSize <3 && d < 0.1 ) || (curSize >= 3 && d <0.05) ){ HLA.log.appendln("Pruning branch:\td:"+d+"\tTP(" + ijs[0] + ")\tx\tOP(" + ijs[1] + ")"); phasedList.remove(k); intersectionSizesSum -= intersectionSizes.get(k); intersectionSizes.remove(k); k--;//update the index since we took k out. 
opUsed[j]--; //added 10/04/16 tpUsed[ijs[0]]--; if(tpUsed[ijs[0]] == 0){ int pathLength = this.getEnd().get(this.getEnd().size()-1) - this.getStart().get(0); HLA.log.appendln("Pruning results in LOSING TP(" + ijs[0] + "):\tcurLen:"+pathLength + "\td2ls:" + distanceToLastSegregation); if(pathLength >= HLA.READ_LENGTH && distanceToLastSegregation >= (0.5 * HLA.READ_LENGTH)){ ms.setSplit(true); HLA.log.appendln("CANT PHASE FURTHER. SPLITTING... (PRUNING-INDUCED)"); return ms; }else if(isClassII && pathLength >=200){ ms.setSplit(true); HLA.log.appendln("CANT PHASE FURTHER. SPLITTING... (CLASS II LENGTH)"); return ms; } } if(tpUsed[ijs[0]] == 1){ HLA.log.appendln("PRUNING RESULTS IN MORE AGRESSIVE INTERSECTION FOR TP( " + ijs[0] + " )"); } //end of added 10/04/16 } } } } } */ int[] tpUsedCopy = tpUsed.clone(); int[] opUsedCopy = opUsed.clone(); int origPhasedPathNum = phasedList.size(); for(int i=0; i<phasedList.size();i++){ int[] ijs = phasedList.get(i); Path tp = this.paths.get(ijs[0]); int tpUsedCtOrig = tpUsedCopy[ijs[0]]; int tpUsedCt = tpUsed[ijs[0]]; Path op = other.getPaths().get(ijs[1]); int opUsedCtOrig = opUsedCopy[ijs[1]]; int opUsedCt = opUsed[ijs[1]]; if(HLA.DEBUG) HLA.log.appendln("[tpUsedCtOrig]: " + tpUsedCtOrig + "\t[tpUsedCt]: " + tpUsedCt + "\n[opUsedCtOrig]: " + opUsedCtOrig + "\t[opUsedCt]: " + opUsedCt); //either TP or OP is being split. 
if((tpUsedCtOrig > 1 || opUsedCtOrig > 1) && (tpUsedCt > 0) && (opUsedCt > 0)){ int curSize = intersectionSizes.get(i); double d = (curSize*1.0d) / (origSizeSum * 1.0d); double tpWiseRatio = (curSize*1.0d) / (intersectionSizesTPSum[ijs[0]] * 1.0d); double opWiseRatio = (curSize*1.0d) / (intersectionSizesOPSum[ijs[1]] * 1.0d); if(HLA.DEBUG) HLA.log.appendln("Checking branch:\td:" + d + "\ttpWiseRatio:" + tpWiseRatio + "\topWiseRatio:" + opWiseRatio + "\tcurSize:" + curSize + "\tpathLength:" + pathLength + "\tTP(" + ijs[0] + ")\tx\tOP(" + ijs[1] + ")"); if( ((curSize <3 && d < 0.1 ) || (curSize >= 3 && d <0.05) || (tpWiseRatio < 0.22) || (curSize >=2 && origPhasedPathNum == 2 && other.getPaths().size() == 1 && opWiseRatio < 0.1) || (curSize >=2 && d <0.15 && origPhasedPathNum > 2 && tpWiseRatio > 0.9 && opWiseRatio < 0.15) || (curSize >=2 && d <0.15 && this.paths.size() == 2 && origPhasedPathNum > 3 && tpWiseRatio < 0.25 && opWiseRatio >=0.65)) ){ if( (tpWiseRatio > 0.8 && opWiseRatio > 0.05 && origPhasedPathNum == 2 && pathLength > (HLA.READ_LENGTH/2)) || (tpWiseRatio == 1.0d && d >0.075 && pathLength > (HLA.READ_LENGTH*0.7)) || (tpWiseRatio < 0.22 && isClassII && curSize > 5 && pathLength < 50) ){ ;//dont prune. 
}else{ if(HLA.DEBUG) HLA.log.appendln("Pruning branch:\td:"+d+"\tTP(" + ijs[0] + ")\tx\tOP(" + ijs[1] + ")"); phasedList.remove(i); intersectionSizesSum -= curSize; intersectionSizes.remove(i); i--; tpUsed[ijs[0]]--; opUsed[ijs[1]]--; if(tpUsed[ijs[0]] == 0){ int lastColumnIndex = this.paths.get(ijs[0]).getLastUniqueEdgeColumnNumber(g, false); int curColumnIndex = other.getStart().get(0); int pathSpecificLastSegregation = curColumnIndex - lastColumnIndex + 1; if(HLA.DEBUG) HLA.log.appendln("lastUniqueColumn:" + lastColumnIndex + "\tcurColumnIndex:" + curColumnIndex ); //int pathLength = this.getEnd().get(this.getEnd().size()-1) - this.getStart().get(0); if(HLA.DEBUG) HLA.log.appendln("Pruning results in LOSING TP(" + ijs[0] + "):\tcurLen:"+pathLength + "\td2ls:" + distanceToLastSegregation + "\td2psls:"+ pathSpecificLastSegregation); if(pathLength >= HLA.READ_LENGTH && distanceToLastSegregation >= (0.5 * HLA.READ_LENGTH)){ ms.setSplit(true); if(HLA.DEBUG) HLA.log.appendln("CANT PHASE FURTHER. SPLITTING... (PRUNING-INDUCED)"); return ms; }else if(pathLength >= HLA.READ_LENGTH && pathSpecificLastSegregation >= (0.6 * HLA.READ_LENGTH)){ ms.setSplit(true); if(HLA.DEBUG) HLA.log.appendln("CANT PHASE FURTHER. SPLITTING... (PRUNING-INDUCED: LONG SEGREGATION)"); return ms; }else if(isClassII && pathLength >=200){ ms.setSplit(true); if(HLA.DEBUG) HLA.log.appendln("CANT PHASE FURTHER. SPLITTING... 
(CLASS II LENGTH)"); return ms; } } if(HLA.DEBUG){ if(tpUsed[ijs[0]] == 1) HLA.log.appendln("PRUNING RESULTS IN MORE AGRESSIVE INTERSECTION FOR TP( " + ijs[0] + " )"); } } } } } int opUsageNum = 0; for(int n : opUsed){ if(n > 0) opUsageNum++; } //for(int[] ijs : phasedList){ for(int i=0;i<phasedList.size();i++){ int[] ijs = phasedList.get(i); int intersectionSize = intersectionSizes.get(i); Path tp = this.paths.get(ijs[0]); Path op = other.getPaths().get(ijs[1]); if(tp.getNumUniqueEdges() > 0){ if(HLA.DEBUG) HLA.log.append("SETTING LAST KNOWN UNIQUE EDGE COLUMN NUMBER AS -->\t"); tp.setLastKnownUniqueEdgeColumnNumber(tp.getLastUniqueEdgeColumnNumber(g, false)); if(HLA.DEBUG) HLA.log.appendln(tp.getLastKnownUniqueEdgeColumnNumber()); }else{ if(HLA.DEBUG) HLA.log.appendln("CANT SET BUT LAST KNOWN UNIQUE COLUMN NUMBER IS -->\t" + tp.getLastKnownUniqueEdgeColumnNumber()); } //if tp and op are used once, merged path between tp and op is the only PATH if(tpUsed[ijs[0]] == 1 && opUsed[ijs[1]] == 1){ Path tmpp = tp.mergePathsUnique(op); paths_new.add(tmpp); } //tp is used once op is used multiple times, we pass tp readset. else if(tpUsed[ijs[0]] == 1 && opUsed[ijs[1]] > 1){ Path tmpp = tp.mergePath1toMany(op); paths_new.add(tmpp); } //tp is used multiple time, op is used once, we pass op readset. else if(tpUsed[ijs[0]] > 1 && opUsed[ijs[1]] == 1){ Path tmpp = tp.mergePathManyto1(op); paths_new.add(tmpp); } //tp is used multiple time, op is used multiple times, we pass intersection. else if(tpUsed[ijs[0]] > 1 && opUsed[ijs[1]] > 1){ Path tmpp = tp.mergePathManytoMany(op); paths_new.add(tmpp); }else{ HLA.log.appendln("SOMETHING IS WRONG. 
[Bubble.java mergeBubble()]"); HLA.log.outToFile(); System.exit(-1); } paths_new.get(paths_new.size() - 1).updateIntersectionSum(intersectionSize, intersectionSizesSum, ijs, interBubbleIntersectionSizes, interBubbleIntersectionCumulativeSizes); } if(HLA.DEBUG) HLA.log.appendln(paths_new.size() + "\tphased paths in paths_new"); /* edge usage update for TP */ for(int i=0; i<this.paths.size();i++){ if(tpUsed[i] == 0) this.paths.get(i).excludePath(); else this.paths.get(i).includePathNTimes(tpUsed[i]-1); } /* edge usage update for OP */ for(int i=0; i<other.getPaths().size();i++){ if(opUsed[i] == 0) other.getPaths().get(i).excludePath(); else other.getPaths().get(i).includePathNTimes(opUsed[i]-1); } if(paths_new.size() > 0){ this.paths = paths_new; if(HLA.DEBUG){ HLA.log.appendln("NEW PATHS"); for(Path p : this.paths) HLA.log.appendln("KnownEdgeCI:\t" + p.getLastKnownUniqueEdgeColumnNumber()); } this.start.addAll(other.getStart()); this.end.addAll(other.getEnd()); this.sNodes.addAll(other.getSNodes()); this.tNodes.addAll(other.getTNodes()); this.bubbleLengths.addAll(other.getBubbleLengths()); this.bubbleScores.addAll(other.getBubbleScores()); } //if we have segregation, meaning we use 2 or more OP(other path) if(opUsageNum > 1){ ms.setSegregating(true); ms.setLastSegregationColumnIndex(other.getEnd().get(other.getEnd().size() - 1)); ms.setSplit(false); } return ms; //return true; //}//end else } /* returns indicies for paths that are being phased between two super bubbles * int[0] : path index for this superBubble * int[1] : path index for other supperBubble (sbo) * int[2] : intersectionSize */ public ArrayList<int[]> getPhasedSuperBubbles(Bubble sbo){ ArrayList<int[]> phasedList = new ArrayList<int[]>(); //stp: super-this-path //sop: super-other-path for(int i=0; i<this.paths.size();i++){ Path stp = this.paths.get(i); for(int j=0; j < sbo.getPaths().size(); j++){ Path sop = sbo.getPaths().get(j); if(HLA.DEBUG) HLA.log.append("STP(" + i + ")\tX\tSOP("+j+"):\t"); int 
intersectionSize = stp.isPhasedWith(sop); //modified so that all stp-sop pair are stored even if they were less the MIN_support_phasing //if(intersectionSize >= Path.MIN_SUPPORT_PHASING){ int[] tmp = new int[3]; tmp[0] = i; tmp[1] = j; tmp[2] = intersectionSize; phasedList.add(tmp); //} } } return phasedList; } public int getNumPaths(){ return this.paths.size(); } }