Java Code Examples for org.apache.spark.api.java.JavaSparkContext.stop()

The following are Java code examples showing how to use the stop() method of the org.apache.spark.api.java.JavaSparkContext class.
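Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the usual pattern: build a SparkConf, create a JavaSparkContext, run the job, and call stop() in a finally block so the context is released even if the job throws. The application name, the local master and the toy transformation are assumptions made purely for illustration.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class StopExample {
    public static void main(String[] args) {
        // Illustrative configuration; app name and local master are assumptions.
        SparkConf conf = new SparkConf().setAppName("StopExample").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            // Any real job would go here; a trivial transformation stands in for it.
            List<Integer> lengths = sc.parallelize(Arrays.asList("a", "bb", "ccc"))
                    .map(String::length)
                    .collect();
            System.out.println(lengths);
        } finally {
            // stop() shuts down the executors, the driver-side context and the web UI,
            // even if the job above failed.
            sc.stop();
        }
    }
}

The examples below follow the same shape: the spark-dependencies jobs (Examples 3 and 4) use the try/finally form, while the others call stop() as the last statement of the method.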
Example 1
Project: ViraPipe   File: RepartitionFastq.java
public static void main(String[] args) throws IOException {

        if (args.length < 3) {
            System.err.println("Usage: RepartitionFastq <input path> <output path> <number of partitions>");
            System.exit(1);
        }

        SparkConf conf = new SparkConf().setAppName("RepartitionFastq");
        //conf.set("spark.default.parallelism", String.valueOf(args[2]));
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaPairRDD<Text, SequencedFragment> fastqRDD = sc.newAPIHadoopFile(args[0], FastqInputFormat.class, Text.class, SequencedFragment.class, sc.hadoopConfiguration());

        JavaPairRDD<Text, SequencedFragment> repartitioned = fastqRDD.repartition(Integer.valueOf(args[2]));

        repartitioned.saveAsNewAPIHadoopFile(args[1], Text.class, SequencedFragment.class, FastqOutputFormat.class, sc.hadoopConfiguration());

        sc.stop();
    }
 
Example 2
Project: ViraPipe   File: SplitFasta.java
public static void main(String[] args) throws IOException {
    Options options = new Options();
    Option pathOpt = new Option( "in", true, "Path to fastq file in hdfs." );
    Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." );
    options.addOption( new Option( "partitions", true, "Divide or merge to n partitions" ) );
    options.addOption( pathOpt );
    options.addOption( opOpt );

    CommandLineParser parser = new BasicParser();
    CommandLine cmd = null;
    try {
        // parse the command line arguments
        cmd = parser.parse( options, args );

    }
    catch( ParseException exp ) {
        // Parsing failed; exit so the null CommandLine is not dereferenced below
        System.err.println( "Parsing failed.  Reason: " + exp.getMessage() );
        System.exit(1);
    }

    String out = cmd.hasOption("out") ? cmd.getOptionValue("out") : null;
    String in = cmd.hasOption("in") ? cmd.getOptionValue("in") : null;
    String partitions = cmd.hasOption("partitions") ? cmd.getOptionValue("partitions") : null;

    SparkConf conf = new SparkConf().setAppName("SplitFasta");
    JavaSparkContext sc = new JavaSparkContext(conf);
    sc.hadoopConfiguration().set("textinputformat.record.delimiter", ">");

    JavaRDD<String> rdd = sc.textFile(in);
    JavaRDD<String> crdd = rdd.map(v->">"+v.trim()).repartition(Integer.valueOf(partitions));

    crdd.saveAsTextFile(out);
    sc.stop();
}
 
Example 3
Project: spark-dependencies   File: CassandraDependenciesJob.java
public void run() {
  long microsLower = day.toInstant().toEpochMilli() * 1000;
  long microsUpper = day.plus(Period.ofDays(1)).toInstant().toEpochMilli() * 1000 - 1;

  log.info("Running Dependencies job for {}: {} ≤ Span.timestamp {}", day, microsLower, microsUpper);
  JavaSparkContext sc = new JavaSparkContext(conf);
  try {
    JavaPairRDD<String, Iterable<Span>> traces = javaFunctions(sc)
        .cassandraTable(keyspace, "traces", mapRowTo(Span.class))
        .where("start_time < ? AND start_time > ?", microsUpper, microsLower)
        .mapToPair(span -> new Tuple2<>(span.getTraceId(), span))
        .groupByKey();

    List<Dependency> dependencyLinks = DependenciesSparkHelper.derive(traces);
    store(sc, dependencyLinks);
    log.info("Done, {} dependency objects created", dependencyLinks.size());
  } finally {
    sc.stop();
  }
}
 
Example 4
Project: spark-dependencies   File: ElasticsearchDependenciesJob.java
void run(String spanResource, String depResource) {
  log.info("Running Dependencies job for {}, reading from {} index, result storing to {}", day, spanResource ,depResource);
  JavaSparkContext sc = new JavaSparkContext(conf);
  try {
    JavaPairRDD<String, Iterable<Span>> traces = JavaEsSpark.esJsonRDD(sc, spanResource)
        .map(new ElasticTupleToSpan())
        .groupBy(Span::getTraceId);

    List<Dependency> dependencyLinks = DependenciesSparkHelper.derive(traces);
    store(sc, dependencyLinks, depResource);
    log.info("Done, {} dependency objects created", dependencyLinks.size());
  } finally {
    sc.stop();
  }
}
 
Example 5
Project: tika-dl4j-spark-imgrec   File: TikaSpark.java
public void run() throws IOException {
    FileSystem fs = DistributedFileSystem.get(new Configuration());
    Path inpath = new Path(input);
    Path outpath = new Path(output);
    if (!fs.exists(inpath)) {
        throw new IllegalArgumentException("Input file not found: " + inpath);
    }
    if (fs.exists(outpath)) {
        throw new IllegalArgumentException("Output file exists, Not overwriting it: " + inpath);
    }

    SparkConf conf = new SparkConf();
    conf.setMaster(sparkMaster);
    conf.setAppName(getClass().getSimpleName() + "::" + System.currentTimeMillis());
    JavaSparkContext ctx = new JavaSparkContext(conf);

    //STEP1: READ
    JavaPairRDD<Text, BytesWritable> rdd = ctx.sequenceFile(input, Text.class, BytesWritable.class);
            //.mapToPair(rec -> new Tuple2<>(new Text(rec._1()), new BytesWritable(rec._2().getBytes())));
    //STEP2: PARSE
    JavaPairRDD<Text, Metadata> parsedRDD = rdd.mapToPair(
            (PairFunction<Tuple2<Text, BytesWritable>, Text, Metadata>) rec -> {
                Metadata md = new Metadata();
                try (ByteArrayInputStream stream = new ByteArrayInputStream(rec._2().getBytes())) {
                    String content = TikaHolder.tika.parseToString(stream, md);
                    md.add("CONTENT", content);
                }
                return new Tuple2<>(rec._1(), md);
            });
    //STEP3: FORMAT
    JavaRDD<String> outRDD = parsedRDD.map((Function<Tuple2<Text, Metadata>, String>) rec -> {
        String key = rec._1().toString();
        Metadata metadata = rec._2();
        JSONObject object = new JSONObject();
        for (String name : metadata.names()) {
            if (metadata.isMultiValued(name)) {
                JSONArray arr = new JSONArray();
                for (String val : metadata.getValues(name)) {
                    arr.add(val);
                }
                object.put(name, arr);
            } else {
                object.put(name, metadata.get(name));
            }
        }
        return key + "\t\t" + object.toJSONString();
    });
    //STEP4: SAVE
    LOG.info("Saving at " + outpath);
    outRDD.saveAsTextFile(output);
    LOG.info("Stopping");
    ctx.stop();
}
 
Example 6
Project: ViraPipe   File: InterleaveMulti.java
public static void main(String[] args) throws IOException {
  SparkConf conf = new SparkConf().setAppName("DecompressInterleave");
  //conf.set("spark.scheduler.mode", "FAIR");
  //conf.set("spark.scheduler.allocation.file", "/opt/cloudera/parcels/CDH-5.10.0-1.cdh5.10.0.p0.41/etc/hadoop/conf.dist/pools.xml");
  JavaSparkContext sc = new JavaSparkContext(conf);
  //sc.setLocalProperty("spark.scheduler.pool", "production");

  Options options = new Options();

  Option splitDirOpt = new Option( "out", true, "Path to output directory in hdfs." );
  Option numsplitsOpt = new Option( "splitsize", true, "Number of reads in split, depends on the size of read file, number of cores and available memory." );
  options.addOption( new Option( "decompress", "" ) );
  options.addOption( new Option( "temp", true, "" ) );
  options.addOption( new Option( "in", true, "" ) );
  options.addOption( new Option( "remtemp", "" ) );
  options.addOption( new Option( "merge", "" ) );

  options.addOption( numsplitsOpt );
  options.addOption( splitDirOpt );
  options.addOption(new Option( "help", "print this message" ));

  HelpFormatter formatter = new HelpFormatter();
  formatter.printHelp( "spark-submit <spark specific args>", options, true );

  CommandLineParser parser = new BasicParser();
  CommandLine cmd = null;
  try {
    // parse the command line arguments
    cmd = parser.parse( options, args );

  }
  catch( ParseException exp ) {
    // Parsing failed; exit so the null CommandLine is not dereferenced below
    System.err.println( "Parsing failed.  Reason: " + exp.getMessage() );
    System.exit(1);
  }
  String input = cmd.hasOption("in") ? cmd.getOptionValue("in") : null;
  int splitsize = cmd.hasOption("splitsize") ? Integer.parseInt(cmd.getOptionValue("splitsize")) : 0;
  boolean merge = cmd.hasOption("merge");
  String outpath = cmd.getOptionValue("out");

  FileSystem fs = FileSystem.get(new Configuration());
  int splitlen = splitsize*4; //FASTQ read is expressed by 4 lines
  FileStatus[] dirs = fs.listStatus(new Path(input));

  Arrays.asList(dirs).forEach(dir -> {
    if(dir.isDirectory()){
      try {
        FileStatus fst = fs.getFileStatus(new Path(input+"/"+dir.getPath().getName()+"/1.fq"));
        FileStatus fst2 = fs.getFileStatus(new Path(input+"/"+dir.getPath().getName()+"/2.fq"));

        if(merge)
          interleaveSplitFastq(fst, fst2, outpath, splitlen, sc);
        else //SAVES SEPARATE HDFS DIRECTORIES
          interleaveSplitFastq(fst, fst2, outpath+"/"+dir.getPath().getName(), splitlen, sc);
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  });

  sc.stop();

}
 
Example 7
Project: ViraPipe   File: SamToFastq.java
public static void main(String[] args) throws IOException {
  SparkConf conf = new SparkConf().setAppName("SamToFastq");
  JavaSparkContext sc = new JavaSparkContext(conf);

  String in = args[0];
  String out = args[1];

  JavaPairRDD<LongWritable, SAMRecordWritable> bamPairRDD = sc.newAPIHadoopFile(in, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, sc.hadoopConfiguration());
  //Map to SAMRecord RDD
  JavaRDD<SAMRecord> samRDD = bamPairRDD.map(v1 -> v1._2().get());

  JavaPairRDD<Text, SequencedFragment> fastqrdd = mapSAMRecordsToFastq(samRDD);

  fastqrdd.saveAsNewAPIHadoopFile(out, Text.class, SequencedFragment.class, FastqOutputFormat.class, sc.hadoopConfiguration());

  sc.stop();

}
 
Example 8
Project: ViraPipe   File: MergeFastq.java
public static void main(String[] args) throws IOException {

        if (args.length < 3) {
            System.err.println("Usage: MergeFastq <input path> <output path> <number of partitions>");
            System.exit(1);
        }

        SparkConf conf = new SparkConf().setAppName("MergeFastq");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaPairRDD<Text, SequencedFragment> fastqRDD = sc.newAPIHadoopFile(args[0], FastqInputFormat.class, Text.class, SequencedFragment.class, sc.hadoopConfiguration());

        JavaPairRDD<Text, SequencedFragment> coalesced = fastqRDD.coalesce(Integer.valueOf(args[2]));

        coalesced.saveAsNewAPIHadoopFile(args[1], Text.class, SequencedFragment.class, FastqOutputFormat.class, sc.hadoopConfiguration());

        sc.stop();
    }
 
Example 9
Project: ViraPipe   File: Interleave.java
public static void main(String[] args) throws IOException {
  SparkConf conf = new SparkConf().setAppName("Interleave");
  //conf.set("spark.scheduler.mode", "FAIR");
  //conf.set("spark.scheduler.allocation.file", "/opt/cloudera/parcels/CDH-5.10.0-1.cdh5.10.0.p0.41/etc/hadoop/conf.dist/pools.xml");
  JavaSparkContext sc = new JavaSparkContext(conf);
  //sc.setLocalProperty("spark.scheduler.pool", "production");

  Options options = new Options();
  Option pairedOpt = new Option( "paired", "Split paired end reads to separate folders, does not interleave." );
  Option intOpt = new Option( "singlesplit", "" );
  options.addOption( new Option( "decompress", "" ) );

  options.addOption( pairedOpt );
  options.addOption( intOpt );
  options.addOption(new Option( "help", "print this message" ));

  HelpFormatter formatter = new HelpFormatter();
  formatter.printHelp( "spark-submit <spark specific args>", options, true );

  CommandLineParser parser = new BasicParser();
  CommandLine cmd = null;
  try {
    // parse the command line arguments
    cmd = parser.parse( options, args );

  }
  catch( ParseException exp ) {
    // Parsing failed; exit so the null CommandLine is not dereferenced below
    System.err.println( "Parsing failed.  Reason: " + exp.getMessage() );
    System.exit(1);
  }

  String fastq = args[0];
  String fastq2 = args[1];
  String outdir = args[2];
  int splitsize = Integer.valueOf(args[3]);
  boolean paired = cmd.hasOption("paired");
  boolean singlesplit = cmd.hasOption("singlesplit");
  boolean decompress = cmd.hasOption("decompress");

  String outdir2 = null;
  if(paired)
    outdir2 = outdir+"2";

  FileSystem fs = FileSystem.get(new Configuration());
  if(decompress){
    decompress(fs, fastq, "temp1.fq");
    decompress(fs, fastq2, "temp2.fq");

    fastq = "temp1.fq";
    fastq2 = "temp2.fq";

  }

    //Count split positions
    int splitlen = splitsize*4; //FASTQ read is expressed by 4 lines

    if(singlesplit){
      FileStatus fstatus = fs.getFileStatus(new Path(fastq));
      splitFastq(fstatus, fastq, outdir, splitlen, sc);
      if(paired){
        FileStatus fstatus2 = fs.getFileStatus(new Path(fastq2));
        splitFastq(fstatus2, fastq2, outdir2, splitlen, sc);
      }
    }else{
      FileStatus fst = fs.getFileStatus(new Path(fastq));
      FileStatus fst2 = fs.getFileStatus(new Path(fastq2));

      interleaveSplitFastq(fst, fst2, outdir, splitlen, sc);
    }

  if(decompress){
    fs.delete(new Path("temp1.fq"), false);
    fs.delete(new Path("temp2.fq"), false);
  }

  sc.stop();

}
 
Example 10
Project: ViraPipe   File: RenameContigsUniq.java
public static void main(String[] args) throws IOException {
    Options options = new Options();
    Option pathOpt = new Option( "in", true, "Path to fastq file in hdfs." );
    Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." );
    options.addOption(  new Option( "partitions", true,"Divide or merge to n partitions" ) );
    options.addOption(new Option( "fa", true, "Include only files with extension given " ));
    options.addOption( pathOpt );
    options.addOption( opOpt );

    CommandLineParser parser = new BasicParser();
    CommandLine cmd = null;
    try {
        cmd = parser.parse( options, args );
    }
    catch( ParseException exp ) {
        // Parsing failed; exit so the null CommandLine is not dereferenced below
        System.err.println( "Parsing failed.  Reason: " + exp.getMessage() );
        System.exit(1);
    }

    String out = cmd.hasOption("out") ? cmd.getOptionValue("out") : null;
    String in = cmd.hasOption("in") ? cmd.getOptionValue("in") : null;
    String fastaonly = cmd.hasOption("fa") ? cmd.getOptionValue("fa") : null;
    String partitions = cmd.hasOption("partitions") ? cmd.getOptionValue("partitions") : null;

    SparkConf conf = new SparkConf().setAppName("RenameContigsUniq");
    JavaSparkContext sc = new JavaSparkContext(conf);
    sc.hadoopConfiguration().set("textinputformat.record.delimiter", ">");

    JavaRDD<String> rdd;
    if(fastaonly!=null)
        rdd = sc.textFile(in+"/*."+fastaonly);
    else
        rdd = sc.textFile(in); //take whole directory as input

    JavaRDD<String> crdd = rdd.filter(f -> f.trim().split("\n")[0].length()!=0).map(fasta->{

        String[] fseq = fasta.trim().split("\n");
        String id = fseq[0].split(" ")[0];

        //Give unique id for sequence
        String seq_id = id+"_"+UUID.randomUUID().toString();
        String seq = Arrays.toString(Arrays.copyOfRange(fseq, 1, fseq.length)).replace(", ","").replace("[","").replace("]","");

        return ">"+seq_id+"\n"+seq;
    });

    if(partitions!=null)
        crdd.repartition(Integer.valueOf(partitions)).saveAsTextFile(out);
    else
        crdd.saveAsTextFile(out);

    sc.stop();
}
 
Example 11
Project: ViraPipe   File: SQLQueryBAM.java
public static void main(String[] args) throws IOException {
  SparkConf conf = new SparkConf().setAppName("SQLQueryBAM");

  JavaSparkContext sc = new JavaSparkContext(conf);
  SQLContext sqlContext = new HiveContext(sc.sc());

  Options options = new Options();
  Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." );
  Option queryOpt = new Option( "query", true, "SQL query string." );
  Option baminOpt = new Option( "in", true, "" );

  options.addOption( opOpt );
  options.addOption( queryOpt );
  options.addOption( baminOpt );
  CommandLineParser parser = new BasicParser();
  CommandLine cmd = null;
  try {
    cmd = parser.parse( options, args );

  }
  catch( ParseException exp ) {
    // Parsing failed; exit so the null CommandLine is not dereferenced below
    System.err.println( "Parsing failed.  Reason: " + exp.getMessage() );
    System.exit(1);
  }

  String bwaOutDir = cmd.hasOption("out") ? cmd.getOptionValue("out") : null;
  String query = cmd.hasOption("query") ? cmd.getOptionValue("query") : null;
  String bamin = cmd.hasOption("in") ? cmd.getOptionValue("in") : null;

  sc.hadoopConfiguration().setBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, true);

  //Read BAM/SAM from HDFS
  JavaPairRDD<LongWritable, SAMRecordWritable> bamPairRDD = sc.newAPIHadoopFile(bamin, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, sc.hadoopConfiguration());
  //Map to SAMRecord RDD
  JavaRDD<SAMRecord> samRDD = bamPairRDD.map(v1 -> v1._2().get());
  JavaRDD<MyAlignment> rdd = samRDD.map(bam -> new MyAlignment(bam.getReadName(), bam.getStart(), bam.getReferenceName(), bam.getReadLength(), new String(bam.getReadBases(), StandardCharsets.UTF_8), bam.getCigarString(), bam.getReadUnmappedFlag(), bam.getDuplicateReadFlag()));

  Dataset<Row> samDF = sqlContext.createDataFrame(rdd, MyAlignment.class);
  samDF.registerTempTable(tablename);
  if(query!=null) {

    //Save as parquet file
    Dataset<Row> df2 = sqlContext.sql(query);
    df2.show(100,false);

    if(bwaOutDir!=null)
      df2.write().parquet(bwaOutDir);

  }else{
    if(bwaOutDir!=null)
      samDF.write().parquet(bwaOutDir);
  }

  sc.stop();

}
 
Example 12
Project: spark-traffic   File: Application.java
public static void main(String[] args) {
        boolean isLocal = false;

        final String master = isLocal ? "local[4]" : "spark://10.128.184.199:7077";
        final String csv = isLocal ? "Z:/RCS_SP1/RAW_DATA_MORE/2016_03/TAXI/TAXI_20160301.csv" : "/pi_nj_57/RCS_SP1/RAW_DATA_MORE/2016_03/TAXI/TAXI_20160301.csv";
        final String appName = "SpeedCalculator";

        Calculator calculator = new Calculator();

        SparkConf conf = new SparkConf()
                .set("spark.executor.memory", "4G")
                .set("spark.submit.deployMode", "cluster")
                .setMaster("spark://10.128.184.199:7077")
                .setJars(new String[]{"C:\\Users\\i321761\\Desktop\\git\\github.wdf.sap.corp\\i321761\\hadoop-sample\\target\\hadoopsample-1.0-SNAPSHOT.jar"});

        JavaSparkContext sc = new JavaSparkContext(master, appName, conf);
//        JavaRDD<String> rdd = sc.textFile(csv, 2);
        JavaRDD<String> rdd = sc.parallelize(Arrays.asList("abc", "def"));
        long start = System.currentTimeMillis();
        System.out.println("Count Start ....");

        // Convert csv string to taxi point structure and remove invalid records
        JavaRDD<ITaxiMonitor.TaxiPoint> taxiPointRDD = rdd.map(line -> TaxiPointUtil.parseTaxiPoint(line))
                .filter(point -> point != null && !point.receiveTime.isEmpty() && point.receiveTime.contains(" 08:"));

        JavaPairRDD<Long, List<ITaxiMonitor.TaxiPoint>> slotsIn5 = taxiPointRDD
                .keyBy(point -> (DateTimeUtil.parseToMillSecond(point.receiveTime, "UTC+8") / 300000) * 300000)
                .combineByKey(
                        // createCombiner: initialize a new list when the first record for a key arrives
                        v -> {
                            List<ITaxiMonitor.TaxiPoint> points = new ArrayList<>();
                            points.add(v);
                            return points;
                        },
            
                        // mergeValue: append a newly arrived record to the existing list for a key
                        (c, v) -> {
                            c.add(v);
                            return c;
                        },
            
                        // mergeCombiners: lists for the same key may be built in different tasks; merge them
                        (c1, c2) -> {
                            c1.addAll(c2);
                            return c1;
                        }
                )
                .sortByKey();
        // Each key represents a 5-minute window of traffic data; run the calculator on each window to compute traffic speeds
        slotsIn5.map(slot -> calculator.execute(slot._2(), slot._1(), slot._1()))
                .collect().forEach(speedResult -> {
                    speedResult.getTimedEdgeSpeeds().forEach(timedEdgeSpeeds -> {
                        long t = DateTimeUtil.parseToMillSecond(timedEdgeSpeeds.timestamp, "UTC+0");
                        timedEdgeSpeeds.edgeSpeeds.forEach(speed -> System.out.println(" * EDGE_SPEED: " + TaxiPointUtil.formatEdgeSpeed(t, speed, ",")));
                    });
                });

        slotsIn5.take(10)
                .forEach(slot -> System.out.println("slot: " + slot._1() + ", " + DateTimeUtil.formatToUTC(slot._1()) + ", count: " + slot._2().size()));
//                .foreach(slot -> System.out.println("slot: " + DateTimeUtil.formatToUTC(slot._1()) + ", count" + slot._2().size()));

        sc.stop();
    }