package cloudBurst; import java.io.IOException; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.RunningJob; import org.apache.hadoop.mapred.SequenceFileInputFormat; import org.apache.hadoop.mapred.SequenceFileOutputFormat; import cloudBurst.MerReduce.MapClass; import cloudBurst.MerReduce.ReduceClass; import cloudBurst.FilterAlignments.FilterCombinerClass; import cloudBurst.FilterAlignments.FilterMapClass; import cloudBurst.FilterAlignments.FilterReduceClass; public class CloudBurst { // Make sure this number is longer than the longest read public static final int CHUNK_OVERLAP = 1024; //------------------------- alignall -------------------------- // Setup and run the hadoop job for running the alignment public static RunningJob alignall(String refpath, String qrypath, String outpath, int MIN_READ_LEN, int MAX_READ_LEN, int K, int ALLOW_DIFFERENCES, boolean FILTER_ALIGNMENTS, int NUM_MAP_TASKS, int NUM_REDUCE_TASKS, int BLOCK_SIZE, int REDUNDANCY) throws IOException, Exception { int SEED_LEN = MIN_READ_LEN / (K+1); int FLANK_LEN = MAX_READ_LEN-SEED_LEN+K; System.out.println("refath: " + refpath); System.out.println("qrypath: " + qrypath); System.out.println("outpath: " + outpath); System.out.println("MIN_READ_LEN: " + MIN_READ_LEN); System.out.println("MAX_READ_LEN: " + MAX_READ_LEN); System.out.println("K: " + K); System.out.println("SEED_LEN: " + SEED_LEN); System.out.println("FLANK_LEN: " + FLANK_LEN); System.out.println("ALLOW_DIFFERENCES: " + ALLOW_DIFFERENCES); System.out.println("FILTER_ALIGNMENTS: " + FILTER_ALIGNMENTS); System.out.println("NUM_MAP_TASKS: " + NUM_MAP_TASKS); System.out.println("NUM_REDUCE_TASKS: " + NUM_REDUCE_TASKS); System.out.println("BLOCK_SIZE: " + BLOCK_SIZE); System.out.println("REDUNDANCY: " + REDUNDANCY); JobConf conf = new JobConf(MerReduce.class); conf.setJobName("CloudBurst"); conf.setNumMapTasks(NUM_MAP_TASKS); conf.setNumReduceTasks(NUM_REDUCE_TASKS); FileInputFormat.addInputPath(conf, new Path(refpath)); FileInputFormat.addInputPath(conf, new Path(qrypath)); conf.set("refpath", refpath); conf.set("qrypath", qrypath); conf.set("MIN_READ_LEN", Integer.toString(MIN_READ_LEN)); conf.set("MAX_READ_LEN", Integer.toString(MAX_READ_LEN)); conf.set("K", Integer.toString(K)); conf.set("SEED_LEN", Integer.toString(SEED_LEN)); conf.set("FLANK_LEN", Integer.toString(FLANK_LEN)); conf.set("ALLOW_DIFFERENCES", Integer.toString(ALLOW_DIFFERENCES)); conf.set("BLOCK_SIZE", Integer.toString(BLOCK_SIZE)); conf.set("REDUNDANCY", Integer.toString(REDUNDANCY)); conf.set("FILTER_ALIGNMENTS", (FILTER_ALIGNMENTS ? "1" : "0")); conf.setMapperClass(MapClass.class); conf.setInputFormat(SequenceFileInputFormat.class); conf.setMapOutputKeyClass(BytesWritable.class); conf.setMapOutputValueClass(BytesWritable.class); conf.setReducerClass(ReduceClass.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(BytesWritable.class); conf.setOutputFormat(SequenceFileOutputFormat.class); Path oPath = new Path(outpath); FileOutputFormat.setOutputPath(conf, oPath); System.err.println(" Removing old results"); FileSystem.get(conf).delete(oPath); RunningJob rj = JobClient.runJob(conf); System.err.println("CloudBurst Finished"); return rj; } //------------------------- filter -------------------------- // Setup and run the hadoop job for filtering the alignments to just report unambiguous bests public static void filter(String alignpath, String outpath, int nummappers, int numreducers) throws IOException, Exception { System.out.println("NUM_FMAP_TASKS: " + nummappers); System.out.println("NUM_FREDUCE_TASKS: " + numreducers); JobConf conf = new JobConf(FilterAlignments.class); conf.setJobName("FilterAlignments"); conf.setNumMapTasks(nummappers); conf.setNumReduceTasks(numreducers); FileInputFormat.addInputPath(conf, new Path(alignpath)); conf.setMapperClass(FilterMapClass.class); conf.setInputFormat(SequenceFileInputFormat.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(BytesWritable.class); conf.setCombinerClass(FilterCombinerClass.class); conf.setReducerClass(FilterReduceClass.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(BytesWritable.class); conf.setOutputFormat(SequenceFileOutputFormat.class); Path oPath = new Path(outpath); FileOutputFormat.setOutputPath(conf, oPath); System.err.println(" Removing old results"); FileSystem.get(conf).delete(oPath); JobClient.runJob(conf); System.err.println("FilterAlignments Finished"); } //------------------------- main -------------------------- // Parse the command line options, run alignment and filtering public static void main(String[] args) throws Exception { String refpath = null; String qrypath = null; String outpath = null; int K = 0; int readlen = 0; int allowdifferences = 0; int nummappers = 1; int numreducers = 1; int numfmappers = 1; int numfreducers = 1; int blocksize = 128; int redundancy = 1; boolean filteralignments = false; int local = 0; // set to zero to use command line arguments if (local == 1) { refpath = "/user/guest/cloudburst/s_suis.br"; qrypath = "/user/guest/cloudburst/100k.br"; outpath = "/user/guest/br-results"; readlen = 36; K = 3; allowdifferences = 0; filteralignments = true; redundancy = 2; } else if (args.length < 13) { System.err.println("Usage: CloudBurst refpath qrypath outpath readlen k allowdifferences filteralignments #mappers #reduces #fmappers #freducers blocksize redundancy"); return; } else { refpath = args[0]; qrypath = args[1]; outpath = args[2]; readlen = Integer.parseInt(args[3]); K = Integer.parseInt(args[4]); allowdifferences = Integer.parseInt(args[5]); filteralignments = Integer.parseInt(args[6]) == 1; nummappers = Integer.parseInt(args[7]); numreducers = Integer.parseInt(args[8]); numfmappers = Integer.parseInt(args[9]); numfreducers = Integer.parseInt(args[10]); blocksize = Integer.parseInt(args[11]); redundancy = Integer.parseInt(args[12]); } if (redundancy < 1) { System.err.println("minimum redundancy is 1"); return; } if (readlen > CHUNK_OVERLAP) { System.err.println("Increase CHUNK_OVERLAP for " + readlen + " length reads, and reconvert fasta file"); return; } // start the timer Timer all = new Timer(); String alignpath = outpath; if (filteralignments) { alignpath += "-alignments"; } // run the alignments Timer talign = new Timer(); alignall(refpath, qrypath, alignpath, readlen, readlen, K, allowdifferences, filteralignments, nummappers, numreducers, blocksize, redundancy); System.err.println("Alignment time: " + talign.get()); // filter to report best alignments if (filteralignments) { Timer tfilter = new Timer(); filter(alignpath, outpath, numfmappers, numfreducers); System.err.println("Filtering time: " + tfilter.get()); } System.err.println("Total Running time: " + all.get()); }; }