java source code of Scaffolder

import graphs.MyEdge;
import graphs.MyGraph;
import graphs.MyNode;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map.Entry;
import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.UnrecognizedOptionException;
import org.biojava3.core.sequence.ProteinSequence;
import org.biojava3.core.sequence.io.FastaReaderHelper;
import utilities.GexfReader;
import utilities.GexfWriter;
import utilities.N50;

public class Scaffolder {
	public static final int VERSION_MAJOR = 1;
	public static final int VERSION_MINOR = 6;
    public static void main(String[] args) throws Exception {
		new Scaffolder(args);
	}

	public Scaffolder(String[] args) throws Exception {
		runOnTerminal(args);
	}

	@SuppressWarnings("static-access")
	private void runOnTerminal(String[] args) throws Exception {
		Options opts = new Options();

		Option input = OptionBuilder
				.withArgName("<targetGenome>")
				.hasArgs(1)
				.withValueSeparator()
				.withDescription(
						"REQUIRED PARAMETER;The option *-i* indicates the name of the target genome file.")
				.create("i");
		opts.addOption(input);

		Option output = OptionBuilder
				.withArgName("<outputName>")
				.hasArgs(1)
				.withValueSeparator()
				.withDescription(
						"OPTIONAL PARAMETER; The option *-o* indicates the name of output fasta file.")
				.create("o");
		opts.addOption(output);

		Option f = OptionBuilder
				.withArgName("<draftsFolder>")
				.hasArgs(1)
				.withValueSeparator()
				.withDescription(
						"OPTIONAL PARAMETER; The option *-f* is optional and indicates the path to the comparison drafts folder")
				.create("f");
		opts.addOption(f);

		Option scriptPath = OptionBuilder
				.withArgName("<medusaScriptsFolder>")
				.hasArgs(1)
				.withValueSeparator()
				.withDescription(
						"OPTIONAL PARAMETER; The folder containing the medusa scripts. Default value: medusa_scripts")
				.create("scriptPath");
		opts.addOption(scriptPath);

		Option verbose = OptionBuilder
				.withValueSeparator()
				.withDescription(
						"RECOMMENDED PARAMETER; The option *-v* (recommended) print on console the information given by the package MUMmer. This option is strongly suggested to understand if MUMmer is not running properly.")
				.create("v");
		opts.addOption(verbose);

		Option weightScheme2 = OptionBuilder
				.withValueSeparator()
				.withDescription(
						"OPTIONAL PARAMETER;The option *-w2* is optional and allows for a sequence similarity based weighting scheme. Using a different weighting scheme may lead to better results.")
				.create("w2");
		opts.addOption(weightScheme2);

		Option gexf = OptionBuilder
				.withValueSeparator()
				.withDescription(
						"OPTIONAL PARAMETER;Conting network and path cover are given in gexf format.")
				.create("gexf");
		opts.addOption(gexf);

                Option threads = OptionBuilder
                                .withArgName("<numberOfThreads>")
                                .hasArgs(1)
                                .withValueSeparator()
                                .withDescription(
                                                "OPTIONAL PARAMETER; The option *-threads* indicates the number of threads to be used with mummer (requires version >= 4.0)")
                                .create("threads");
                opts.addOption(threads);

		Option help = OptionBuilder.withValueSeparator()
				.withDescription("Print this help and exist.").create("h");
		opts.addOption(help);

		Option distances = OptionBuilder
				.withValueSeparator()
				.withDescription(
						"OPTIONAL PARAMETER;The option *-d* allows for the estimation of the distance between pairs of contigs based on the reference genome(s): in this case the scaffolded contigs will be separated by a number of N characters equal to this estimate. The estimated distances are also saved in the <targetGenome>_distanceTable file. By default the scaffolded contigs are separated by 100 Ns")
				.create("d");
		opts.addOption(distances);

		Option random = OptionBuilder
				.withArgName("<numberOfRounds>")
				.hasArgs(1)
				.withValueSeparator()
				.withDescription(
						"OPTIONAL PARAMETER;The option *-random* is available (not required). This option allows the user to run a given number of cleaning rounds and keep the best solution. Since the variability is small 5 rounds are usually sufficient to find the best score.")
				.create("random");
		opts.addOption(random);

		Option n50 = OptionBuilder.withArgName("<fastaFile>").hasArgs(1)
				.withValueSeparator()
				.withDescription("OPTIONAL PARAMETER; The option *-n50* allows the calculation of the N50 statistic on a FASTA file. In this case the usage is the following: java -jar medusa.jar -n50 <name_of_the_fasta>. All the other options will be ignored.")
				.create("n50");
		opts.addOption(n50);

		BasicParser bp = new BasicParser();
		try {

			CommandLine cl = bp.parse(opts, args);
			if (cl.hasOption("h")
					|| (!cl.hasOption("i") && !cl.hasOption("n50"))) {
				printHelp(opts);
			}else if (cl.hasOption("n50")) {
				n50avaluation(cl);
			}else if (cl.hasOption("i")) {
				scaffolder(cl);
			}
		} catch (UnrecognizedOptionException uoe) {
			printHelp(opts);
		}
	}

	private void printHelp(Options opts) {
		System.out.println(String.format("Medusa version %d.%d", VERSION_MAJOR,
				VERSION_MINOR));
		HelpFormatter f1 = new HelpFormatter();
		f1.printHelp("java -jar medusa.jar -i inputfile -v",
				"available options: ", opts, "");

	}

	private int computeLenght(ArrayList<String> paths) {
		int l = 0;
		String current;
		for (int i = 0; i < paths.size(); i++) {
			current = paths.get(i);
			String[] currentSplit = current.split("@");
			int le = Integer.parseInt(currentSplit[1].replaceFirst("@", ""));
			l += le;
		}
		return l;
	}

	private void scaffolder(CommandLine cl) throws Exception {

		String input = cl.getOptionValues("i")[0];
		System.out.println("INPUT FILE:" + input);
		int rounds = 1;
		if (cl.getOptionValue("random") != null) {
			rounds = Integer.parseInt(cl.getOptionValue("random"));
			System.out.println("Random rounds:" + rounds);
		}
		String scaffoldsfilename = input + "Scaffold.fasta";
		if (cl.getOptionValue("o") != null) {
			scaffoldsfilename = cl.getOptionValue("o");
		}
		String draftsFolder = "drafts";
		if (cl.getOptionValue("f") != null) {
			draftsFolder = cl.getOptionValue("f");
		}
		String medusaScripts = "medusa_scripts";
		if (cl.getOptionValue("scriptPath") != null) {
			medusaScripts = cl.getOptionValue("scriptPath");
		}
		Boolean distanceEstimation =false;
		if(cl.hasOption("d")){
			distanceEstimation = true;
		}
		String threads = "";
                if (cl.getOptionValue("threads") != null) {
                        threads = cl.getOptionValue("threads");
                }

		/*
		 * #######################################################################
		 * STEP1: Mapping the target contigs onto the comparison genomes using
		 * the MUMmer software.
		 * #######################################################################
		 *
		 */

		System.out.println("------------------------------------------------------------------------------------------------------------------------");
		String line;
		System.out.print("Running MUMmer...");
		Process process = new ProcessBuilder(medusaScripts + "/mmrBatch.sh",
				draftsFolder, input, medusaScripts,threads).start();
		BufferedReader errors = new BufferedReader(new InputStreamReader(
				process.getErrorStream()));
		if (cl.hasOption("v")) {
			while ((line = errors.readLine()) != null) {
				System.out.println(line);
			}
			if (process.waitFor() != 0) {
				throw new RuntimeException("Error running MUMmer.");
			}
		} else {
			while ((line = errors.readLine()) != null) {
			}
			if (process.waitFor() != 0) {
				throw new RuntimeException("Error running MUMmer.");
			}
		}

		System.out.print("done.\n");


		/*
		 * #######################################################################
		 * STEP2: The adjacencies collected by MUMmer are used by a python script to populate an
		 * undirected weighted graph.
		 * #######################################################################
		 */
		System.out.println("------------------------------------------------------------------------------------------------------------------------");
		System.out.print("Building the network...");

		String current = new java.io.File(".").getCanonicalPath();

		if (cl.hasOption("w2")) {// this weight takes in account the quality of the hits.
			process = new ProcessBuilder("python3", medusaScripts
					+ "/netcon_mummer.py", "-f" + current, "-i" + input,
					"-onetwork", "-w").start();
			errors = new BufferedReader(new InputStreamReader(
					process.getErrorStream()));
			while ((line = errors.readLine()) != null) {
				System.out.println(line);
			}
			if (process.waitFor() != 0) {
				throw new RuntimeException(
						"Error: Network construction failed.");
			}
		} else {// default weight scheme
			process = new ProcessBuilder("python3", medusaScripts
					+ "/netcon_mummer.py", "-f" + current, "-i" + input,
					"-onetwork").start();
			errors = new BufferedReader(new InputStreamReader(
					process.getErrorStream()));
			while ((line = errors.readLine()) != null) {
				System.out.println(line);
			}
			if (process.waitFor() != 0) {
				throw new RuntimeException(
						"Error: Network construction failed.");
			}
		}
		/*
		 * The network generated by the python script is stored in an object belonging to the class MyGraph.
		 */
		MyGraph graph = GexfReader.read("network");

		// Remove temporary files.
		File network = new File("network");
		network.delete();
		File dir = new File(".");
		for (String address : dir.list((File dir1, String name) -> name.toLowerCase().endsWith(".coords"))) {
			File f = new File(address);
			f.delete();
		}
		for (String address : dir.list((File dir1, String name) -> name.toLowerCase().endsWith(".delta"))) {
			File f = new File(address);
			f.delete();
		}

		if (graph.getEdges().size() == 0) {
			System.out
					.println("SORRY: No information found. Are you sure to have MUMmer packedge location in your PATH? If yes, the chosen drafts genomes don't provide sufficient information for scaffolding the target genome.");
			return;
		}
		System.out.print("done.\n");

		/*
		 * #######################################################################
		 * STEP3: Giving an order to the contigs.
		 * The order is assigned by transform the graph into a disjoint path cover.
		 * This cover is again stored in a MyGraph object.
		 * #######################################################################
		 */

		System.out.println("------------------------------------------------------------------------------------------------------------------------");
		System.out.print("Cleaning the network...");

		double factor = 1;
		for (MyEdge e : graph.getEdges()) {
			factor += e.getWeight();
		}
		MyGraph supportGraph = new MyGraph(graph);// work on a copy of the graph to keep the original network available.
		MyGraph cover = greedyCover(supportGraph, factor);

		/*
		 * #######################################################################
		 * STEP4: Give a orientation to the contigs.
		 * The cover is processed in order to assign an orientation to each contig.
		 * The paths of the cover are traversed and, for each node, a consistent orientation is assigned
		 * to its sequence.
		 * #######################################################################
		 */

		cover.cleanOrinetation();

		/*
		 * RANDOM OPTION: Give a orientation to the contigs.
		 * If the random option is present more then one candidate cover is taken in account.
		 * STEP3 and STEP4 are executed each time, and the best solution is taken.
		 */
		if (rounds > 1) {
			System.out.println("Candidate cover size: "
					+ cover.getEdges().size());
			for (int i = 2; i <= rounds; i++) {
				supportGraph = new MyGraph(graph);
				MyGraph candidateCover = greedyCoverRandom(supportGraph,
						factor);
				candidateCover.cleanOrinetation();
				System.out.println("Candidate cover size: "
						+ candidateCover.getEdges().size());
				if (candidateCover.getEdges().size() > cover.getEdges().size()) {
					cover = candidateCover;
				}
			}
			System.out.println("Best cover size: " + cover.getEdges().size());
		}
		System.out.print("done.\n");


		/*
		 * #######################################################################
		 * FINAL OUTPUT: The paths in the cover can be finally read as scaffolds.
		 * Each node is now associated to a properly oriented sequence.
		 * Default and optional output files are created.
		 * #######################################################################
		 */
		System.out.println("------------------------------------------------------------------------------------------------------------------------");

		/*
		 * DEFAULT OUTPUTS:
		 *
		 * ouptut: A textual summary of the results.
		 * output2: A .fasta file containing a sequence for each scaffold.
		 *
		 */

		LinkedHashMap<String, ProteinSequence> a = FastaReaderHelper.readFastaProteinSequence(new File(input));
		HashMap<String, String> sequences = new HashMap<>();
		for (ProteinSequence s : a.values()) {
			sequences.put(s.getOriginalHeader().split(" ")[0],
					s.getSequenceAsString());
		}

		ArrayList<String> scaffolds = cover.readScaffoldsSeq(input, sequences, distanceEstimation);
		File outputFile2 = new File(scaffoldsfilename);
		PrintWriter writerOutput2 = new PrintWriter(new FileWriter(outputFile2));
		int j = 1;
		for (String s : scaffolds) {
			writerOutput2.println(">Scaffold_" + j);
			writerOutput2.println(s);
			j++;
		}

		for (MyNode n : cover.getNodes()) {
			if (n.getDegree() == 0) {
				writerOutput2.println(">Scaffold_" + j);
				writerOutput2.println(sequences.get(n.getId()));
				j++;
			}
		}

		writerOutput2.flush();
		System.out.println("Scaffolds File saved: " + outputFile2);

		System.out.println("------------------------------------------------------------------------------------------------------------------------");

		File outputFile = new File(input + "_SUMMARY");
		PrintWriter writerOutput = new PrintWriter(new FileWriter(outputFile));
		writerOutput.write("Target genome: " + input + "\n");

		writerOutput.println("\n--------------SCAFFOLDS---------------\n");

		int finalSingletons = (cover.getNodes().size() - cover.notSingletons());
		ArrayList<String> paths = cover.subPaths();
		int totalLength = computeLenght(paths);

		int numberOfScaffolds = paths.size() + finalSingletons;

		System.out.println("Number of scaffolds: " + numberOfScaffolds
				+ " (singletons = " + finalSingletons
				+ ", multi-contig scaffold = " + paths.size() + ") \nfrom "
				+ cover.getNodes().size() + " initial fragments.");
		System.out.println("Total length of the jointed fragments: "
				+ totalLength);
		double in50 = n50avaluation(scaffoldsfilename);

		writerOutput.println("Number of scaffolds: " + numberOfScaffolds
				+ " (singletons = " + finalSingletons
				+ ", multi-contig scaffold = " + paths.size() + ") \nfrom "
				+ cover.getNodes().size() + " initial fragments.");
		writerOutput.println("Total length of the jointed fragments: "
				+ totalLength);
		writerOutput.println("Computing N50 on " + numberOfScaffolds + " sequences.");
		writerOutput.println("N50: " + in50);
		writerOutput.println("----------------------");
		writerOutput.flush();
		System.out.println("Summary File saved: " + outputFile);

		/*
		 * OPTIONAL OUTPUT:
		 * The network and the cover are outputted in .gexf format.
		 */

		if (cl.hasOption("gexf")) {
			System.out
					.println("------------------------------------------------------------------------------------------------------------------------");
			GexfWriter.write(graph, input + "_network.gexf");
			GexfWriter.write(cover, input + "_cover.gexf");
		}
		if (cl.hasOption("d")) {
			System.out
					.println("------------------------------------------------------------------------------------------------------------------------");
			MyGraph.writeDistanceFile(cover, input + "_distanceTable");
		}
	}


	/*
	 * #######################################################################
	 * #######################################################################
	 * ADDITIONAL METHODS.
	 * #######################################################################
	 */

	private MyGraph greedyCoverRandom(MyGraph network, double factor) {
		ArrayList<MyEdge> candidateEdges = new ArrayList<MyEdge>(
				network.getEdges());
		for (MyEdge edge : candidateEdges) {
			double w = edge.getWeight() + factor;
			edge.setWeight(w);

		}
		ArrayList<MyEdge> chosenEdges = new ArrayList<MyEdge>();
		MyGraph cover = new MyGraph(network.getNodes(), chosenEdges);
		for (MyNode n : cover.getNodes()) {
			n.setAdj(new ArrayList<MyNode>());
		}
		HashMap<MyNode, MyNode> twins = new HashMap<MyNode, MyNode>();
		for (MyNode n : network.getNodes()) {
			twins.put(n, n);
		}
		IdComparator comparatorId = new IdComparator();
		weightComparator comparatorW = new weightComparator();
		Collections.sort(candidateEdges, comparatorId);// ID sorting
		Collections.shuffle(candidateEdges);
		Collections.sort(candidateEdges, comparatorW);// weight sorting
		Collections.reverse(candidateEdges);
		while (!candidateEdges.isEmpty()) {
			MyEdge candidate = candidateEdges.get(0);
			MyNode source = candidate.getSource();
			MyNode target = candidate.getTarget();
			if (twins.get(source) == target) {
				candidateEdges.remove(candidate);
			} else {
				cover.addEdge(candidate);
				MyNode ps = twins.get(source);
				MyNode pt = twins.get(target);
				twins.put(ps, pt);
				twins.put(pt, ps);
				if (cover.nodeFromId(target.getId()).getDegree() > 1) {
					ArrayList<MyEdge> a = network.inoutEdges(target);
					candidateEdges.removeAll(a);
				} else {
					candidateEdges.remove(candidate);
				}
				if (cover.nodeFromId(source.getId()).getDegree() > 1) {
					ArrayList<MyEdge> b = network.inoutEdges(source);
					candidateEdges.removeAll(b);
				} else {
					candidateEdges.remove(candidate);
				}
			}

		}
		return cover;

	}

	private static MyGraph greedyCover(MyGraph network, double factor) {
		ArrayList<MyEdge> candidateEdges = new ArrayList<MyEdge>(
				network.getEdges());
		for (MyEdge edge : candidateEdges) {
			double w = edge.getWeight() + factor;
			edge.setWeight(w);

		}
		ArrayList<MyEdge> chosenEdges = new ArrayList<MyEdge>();
		MyGraph cover = new MyGraph(network.getNodes(), chosenEdges);
		for (MyNode n : cover.getNodes()) {
			n.setAdj(new ArrayList<MyNode>());
		}
		HashMap<MyNode, MyNode> twins = new HashMap<MyNode, MyNode>();
		for (MyNode n : network.getNodes()) {
			twins.put(n, n);
		}
		IdComparator comparatorId = new IdComparator();
		Collections.sort(candidateEdges, comparatorId);
		weightComparator comparatorW = new weightComparator();
		Collections.sort(candidateEdges, comparatorW);
		Collections.reverse(candidateEdges);

		for (int i = 0; i < candidateEdges.size() - 1; i++) {
			if (candidateEdges.get(i).getWeight() < candidateEdges.get(i + 1)
					.getWeight()) {
				System.out.println("ERROR: "
						+ candidateEdges.get(i).toStringVerbose() + " > "
						+ candidateEdges.get(i + 1).toStringVerbose());
			} else if (candidateEdges.get(i).getWeight() == candidateEdges.get(
					i + 1).getWeight()) {

			} else if (candidateEdges.get(i).getWeight() > candidateEdges.get(
					i + 1).getWeight()) {

			}
		}


		while (!candidateEdges.isEmpty()) {
			MyEdge candidate = candidateEdges.get(0);
			MyNode source = candidate.getSource();
			MyNode target = candidate.getTarget();
			if (twins.get(source) == target) {
				candidateEdges.remove(candidate);
			} else {
				cover.addEdge(candidate);
				MyNode ps = twins.get(source);
				MyNode pt = twins.get(target);
				twins.put(ps, pt);
				twins.put(pt, ps);
				if (cover.nodeFromId(target.getId()).getDegree() > 1) {
					ArrayList<MyEdge> a = network.inoutEdges(target);
					candidateEdges.removeAll(a);
				} else {
					candidateEdges.remove(candidate);
				}
				if (cover.nodeFromId(source.getId()).getDegree() > 1) {
					ArrayList<MyEdge> b = network.inoutEdges(source);
					candidateEdges.removeAll(b);
				} else {
					candidateEdges.remove(candidate);
				}
			}

		}
		return cover;
	}

	/*
	 * N50 EVALUATOR:
	 * This method read a .fasta file and  write on console three values.
	 * These three values corresponds to the N50 statistic for the given set of sequences.
	 */
	private void n50avaluation(CommandLine cl) throws Exception {
		ArrayList<Integer> lenghts = new ArrayList<>();
		LinkedHashMap<String, ProteinSequence> a = FastaReaderHelper
				.readFastaProteinSequence(new File(cl.getOptionValue("n50")));
		for (Entry<String, ProteinSequence> entry : a.entrySet()) {
			int l = entry.getValue().getLength();
			lenghts.add(l);
		}
		System.out.println("Number of sequences in the file: " + lenghts.size());
		System.out.println("N50: " + N50.n50(lenghts));
		System.out.println("----------------------");

	}

	private double n50avaluation(String scaffoldsfilename) throws Exception {
		ArrayList<Integer> lenghts = new ArrayList<>();
		LinkedHashMap<String, ProteinSequence> a = FastaReaderHelper
				.readFastaProteinSequence(new File(scaffoldsfilename));
		for (Entry<String, ProteinSequence> entry : a.entrySet()) {
			int l = entry.getValue().getLength();
			lenghts.add(l);
		}
		System.out.println("Computing N50 on " + lenghts.size()+ " sequences.");
		System.out.println("N50: " + N50.n50(lenghts));
		System.out.println("----------------------");

		return N50.n50(lenghts);
	}
}