package cloudBurst; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.SequenceFile.Writer; import org.apache.hadoop.mapred.JobConf; public class ConvertFastaForCloud { private static final FastaRecord record = new FastaRecord(); public static int min_seq_len = Integer.MAX_VALUE; public static int max_seq_len = 0; public static int min(int a, int b) { if (a < b) return a; return b; } public static int max(int a, int b) { if (a > b) return a; return b; } private static IntWritable iw = new IntWritable(); public static void saveSequence(int id, StringBuilder sequence, Writer writer) throws IOException { int fulllength = sequence.length(); int maxchunk = 65535; if (fulllength < min_seq_len) { min_seq_len = fulllength; } if (fulllength > max_seq_len) { max_seq_len = fulllength; } if (fulllength > 100) { System.out.println("In " + id + "... " + fulllength + "bp"); } int offset = 0; int numchunks = 0; while(offset < fulllength) { numchunks++; int end = min(offset + maxchunk, fulllength); boolean lastChunk = (end == fulllength); record.m_sequence = DNAString.stringToBytes(sequence.substring(offset, end)); record.m_offset = offset; record.m_lastChunk = lastChunk; iw.set(id); writer.append(iw, record.toBytes()); if (end == fulllength) { offset = fulllength; } else { offset = end - cloudBurst.CloudBurst.CHUNK_OVERLAP; } } if (numchunks > 1) { System.out.println(" " + numchunks + " chunks"); } } public static void convertFile(String infile, SequenceFile.Writer writer) throws IOException { String header = ""; StringBuilder sequence = null; int count = 0; try { BufferedReader data = new BufferedReader(new InputStreamReader(new FileInputStream(infile))); String mapfile = infile; mapfile += ".map"; FileWriter fstream = new FileWriter(mapfile); BufferedWriter out = new BufferedWriter(fstream); String line; while ((line = data.readLine()) != null) { line.trim(); if (line.charAt(0) == '>') { if (count > 0) { saveSequence(count, sequence, writer); } sequence = new StringBuilder(); header = line.substring(1); // skip the > count++; out.write(count + " " + header + "\n"); } else { sequence.append(line.toUpperCase()); } } saveSequence(count, sequence, writer); out.close(); } catch (FileNotFoundException e) { System.err.println("Can't open " + infile); e.printStackTrace(); System.exit(1); } System.err.println("Processed " + count + " sequences"); } /** * @param args * @throws IOException */ public static void main(String[] args) throws IOException { if (args.length != 2) { System.err.println("Usage: ConvertFastaForCloud file.fa outfile.br"); System.exit(-1); } String infile = args[0]; String outfile = args[1]; System.err.println("Converting " + infile + " into " + outfile); JobConf config = new JobConf(); SequenceFile.Writer writer = SequenceFile.createWriter(FileSystem.get(config), config, new Path(outfile), IntWritable.class, BytesWritable.class); convertFile(infile, writer); writer.close(); System.err.println("min_seq_len: " + min_seq_len); System.err.println("max_seq_len: " + max_seq_len); System.err.println("Using DNAString version: " + DNAString.VERSION); } };