package org.ngseq.metagenomics;

import org.apache.commons.cli.*;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import java.io.IOException;

/**
 * Created by davbzh on 2017-04-14.
 */
public class SplitFasta {

    public static void main(String[] args) throws IOException {
        Options options = new Options();
        Option pathOpt = new Option( "in", true, "Path to fastq file in hdfs." );
        Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." );
        options.addOption(  new Option( "partitions", "Divide or merge to n partitions" ) );
        options.addOption( pathOpt );
        options.addOption( opOpt );

        CommandLineParser parser = new BasicParser();
        CommandLine cmd = null;
        try {
            // parse the command line arguments
            cmd = parser.parse( options, args );

        }
        catch( ParseException exp ) {
            // oops, something went wrong
            System.err.println( "Parsing failed.  Reason: " + exp.getMessage() );
        }

        String out = (cmd.hasOption("out")==true)? cmd.getOptionValue("out"):null;
        String in = (cmd.hasOption("in")==true)? cmd.getOptionValue("in"):null;
        String partitions = (cmd.hasOption("partitions")==true)? cmd.getOptionValue("partitions"):null;

        SparkConf conf = new SparkConf().setAppName("SplitFasta");
        JavaSparkContext sc = new JavaSparkContext(conf);
        sc.hadoopConfiguration().set("textinputformat.record.delimiter", ">");

        JavaRDD<String> rdd = sc.textFile(in);
        JavaRDD<String> crdd = rdd.map(v->">"+v.trim()).repartition(Integer.valueOf(partitions));

        crdd.saveAsTextFile(out);
        sc.stop();
    }
}