Java Code Examples for org.apache.spark.SparkConf

The following examples show how to use org.apache.spark.SparkConf in Java. They are extracted from open source projects.
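
Before the project examples, here is a minimal, self-contained sketch of the pattern they all share: build a SparkConf, set an application name and (optionally) a master URL, then hand the configuration to a JavaSparkContext. The app name, master and serializer setting below are placeholders, not taken from any of the projects listed here.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class SparkConfSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("MyApp")        // placeholder application name
                .setMaster("local[*]")      // run locally on all cores; omit when submitting to a cluster
                .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");

        // The configuration is copied into the context; changes made to conf afterwards have no effect
        JavaSparkContext sc = new JavaSparkContext(conf);
        System.out.println(sc.getConf().toDebugString());
        sc.stop();
    }
}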
Example 1
Project: Apache-Spark-2x-for-Java-Developers   File: S3Example.java
public static void main(String[] args) {
		System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");
		SparkConf conf = new SparkConf().setMaster("local").setAppName("S3 Example");
		JavaSparkContext jsc = new JavaSparkContext(conf);
		//jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", "Your awsAccessKeyId");
		//jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", "your awsSecretAccessKey");
		
		
		System.out.println(System.getenv("AWS_ACCESS_KEY_ID"));
		JavaRDD<String> textFile = jsc.textFile("s3a://"+"trust"+"/"+"MOCK_DATA.csv");
		
//		textFile.flatMap(x -> Arrays.asList(x.split(",")).iterator()).mapToPair(x -> new Tuple2<String, Integer>((String) x, 1))
//		.reduceByKey((x, y) -> x + y).saveAsTextFile("s3n://"+"trust"+"/"+"out.txt");
		
		textFile.flatMap(x -> Arrays.asList(x.split(",")).iterator()).mapToPair(x -> new Tuple2<String, Integer>((String) x, 1))
		.reduceByKey((x, y) -> x + y).saveAsTextFile("s3a://"+"trust"+"/"+"out.txt");
	}
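
If the AWS credentials are not supplied through environment variables, as the println above suggests, the s3a filesystem can read them from the Hadoop configuration instead; a sketch of the s3a equivalent of the commented-out s3n lines above (the key names are the standard Hadoop S3A properties, the values are placeholders):

		// s3a equivalent of the commented-out s3n credential settings above
		jsc.hadoopConfiguration().set("fs.s3a.access.key", "yourAccessKey");
		jsc.hadoopConfiguration().set("fs.s3a.secret.key", "yourSecretKey");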
 
Example 2
Project: kafka-sandbox   File: SparkStringConsumer.java
public static void main(String[] args) {

        SparkConf conf = new SparkConf()
                .setAppName("kafka-sandbox")
                .setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaStreamingContext ssc = new JavaStreamingContext(sc, new Duration(2000));

        Set<String> topics = Collections.singleton("mytopic");
        Map<String, String> kafkaParams = new HashMap<>();
        kafkaParams.put("metadata.broker.list", "localhost:9092");

        JavaPairInputDStream<String, String> directKafkaStream = KafkaUtils.createDirectStream(ssc,
                String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topics);

        directKafkaStream.foreachRDD(rdd -> {
            System.out.println("--- New RDD with " + rdd.partitions().size()
                    + " partitions and " + rdd.count() + " records");
            rdd.foreach(record -> System.out.println(record._2));
        });

        ssc.start();
        ssc.awaitTermination();
    }
 
Example 3
Project: big-data-benchmark   File: SparkWordCount.java
public static void main(String[] args) {
    if (args.length != 2) {
        System.err.println("Usage:");
        System.err.println("  SparkWordCount <sourceFile> <targetFile>");
        System.exit(1);
    }

    SparkConf conf = new SparkConf()
            .setAppName("Word Count");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> textFile = sc.textFile(args[0]);
    JavaRDD<String> words = textFile.flatMap(LineIterator::new);
    JavaPairRDD<String, Long> pairs =
            words.mapToPair(s -> new Tuple2<>(s, 1L));
    JavaPairRDD<String, Long> counts =
            pairs.reduceByKey((Function2<Long, Long, Long>) (a, b) -> a + b);

    System.out.println("Starting task..");
    long t = System.currentTimeMillis();
    counts.saveAsTextFile(args[1] + "_" + t);
    System.out.println("Time=" + (System.currentTimeMillis() - t));
}
 
Example 4
Project: gcp   File: Spark4KafkaNew.java
public static void main(String[] args) throws InterruptedException {
  SparkConf sc = new SparkConf().setAppName("POC-Kafka-New");
  
  try(JavaStreamingContext jsc = new JavaStreamingContext(sc, new Duration(2000))) {
    
    JavaPairInputDStream<String, String> stream = KafkaUtils.createDirectStream(
        jsc, String.class, String.class, StringDecoder.class, StringDecoder.class,
        Collections.singletonMap("metadata.broker.list", KAFKA_HOST_PORT),
        Collections.singleton(EXAMPLE_TOPIC));

    JavaDStream<ExampleXML> records = stream.map(t -> t._2()).map(new ParseXML());
    records.foreachRDD(rdd -> System.out.printf("Amount of XMLs: %d\n", rdd.count()));

    jsc.start();
    jsc.awaitTermination();
  }
}
 
Example 5
Project: gcp   File: Spark2Streaming.java
public static void main(String[] args) throws InterruptedException {
  SparkConf sc = new SparkConf().setAppName("POC-Streaming");
  try(JavaStreamingContext jsc = new JavaStreamingContext(sc, new Duration(2000))) {
    //JavaDStream<SampleXML> records = jsc.textFileStream("input/").map(new ParseXML());
    //textFileStream processes files line by line, so each XML document has to be on a single line to work; alternative below

    JavaRDD<String> files = jsc.sparkContext().wholeTextFiles("input/").map(tuple -> tuple._2());
    Queue<JavaRDD<String>> rddQueue = new LinkedList<>();
    rddQueue.add(files);
    JavaDStream<String> records = jsc.queueStream(rddQueue);

    records.foreachRDD(rdd -> System.out.printf("Amount of XMLs: %d\n", rdd.count()));

    jsc.start();
    jsc.awaitTermination();
  }
}
 
Example 6
Project: stonk   File: SparkHDFSTest.java
@Test
public void test() {
    String hdfsPath = "hdfs://10.196.83.90:9000/stonk/spark/aa/spark-task--aa-b5x59zpv/out3";

    SparkConf conf = new SparkConf().setAppName("111").setMaster("local[3]");
    JavaSparkContext context = new JavaSparkContext(conf);
    JavaRDD<String> rdd = context.textFile(hdfsPath);
    rdd.foreach((str) -> System.out.println(str));
}
 
Example 7
Project: gcp   File: Spark6BigQuery.java
public static void main(String[] args) throws InterruptedException, IOException {
  SparkConf sc = new SparkConf().setAppName("POC-BigQuery");
  
  try(JavaStreamingContext jsc = new JavaStreamingContext(sc, new Duration(60000))) {
    JavaPairInputDStream<String, String> stream = KafkaUtils.createDirectStream(
        jsc, String.class, String.class, StringDecoder.class, StringDecoder.class,
        Collections.singletonMap("metadata.broker.list", KAFKA_HOST_PORT), Collections.singleton(EXAMPLE_TOPIC));

    Configuration conf = new Configuration();
    BigQueryConfiguration.configureBigQueryOutput(conf, BQ_EXAMPLE_TABLE, BQ_EXAMPLE_SCHEMA);
    conf.set("mapreduce.job.outputformat.class", BigQueryOutputFormat.class.getName());

    JavaDStream<ExampleXML> records = stream.map(t -> t._2()).map(new ParseXML());
    records.foreachRDD(rdd -> {
      System.out.printf("Amount of XMLs: %d\n", rdd.count());
      long time = System.currentTimeMillis();
      rdd.mapToPair(new PrepToBQ()).saveAsNewAPIHadoopDataset(conf);
      System.out.printf("Sent to BQ in %fs\n", (System.currentTimeMillis()-time)/1000f);
    });
    
    jsc.start();
    jsc.awaitTermination();
  }
}
 
Example 8
Project: Apache-Spark-2x-for-Java-Developers   File: WordCount.java
public static void wordCountJava8( String filename )
{
    // Define a configuration to use to interact with Spark
    SparkConf conf = new SparkConf().setMaster("local").setAppName("Work Count App");

    // Create a Java version of the Spark Context from the configuration
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Load the input data, which is a text file read from the command line
    JavaRDD<String> input = sc.textFile( filename );

    // Java 8 with lambdas: split the input string into words
    JavaRDD<String> words = input.flatMap( s -> Arrays.asList( s.split( " " ) ).iterator() );

    // Java 8 with lambdas: transform the collection of words into pairs (word, 1) and then count them
    JavaPairRDD<String, Integer> counts = words.mapToPair( t -> new Tuple2<>( t, 1 ) ).reduceByKey( (x, y) -> x + y );

    // Save the word count back out to a text file, causing evaluation.
    counts.saveAsTextFile( "output" );
}
 
Example 9
Project: Sempala   File: Spark.java
/**
 * Initializes a Spark connection. Use it afterwards for execution of Spark
 * SQL queries.
 * 
 * @param appName
 *            the name of the app that will be used with this Spark
 *            connection
 * @param database
 *            name of the database that will be used with this Spark
 *            connection
 */
public Spark(String appName, String database) {

	// TODO check what will happen if there is already in use the same app
	// name
	this.sparkConfiguration = new SparkConf().setAppName(appName);
	this.javaContext = new JavaSparkContext(sparkConfiguration);
	this.hiveContext = new HiveContext(javaContext);
	// TODO check what kind of exception can be thrown here if there is a
	// problem with spark connection

	this.hiveContext.sql(String.format("CREATE DATABASE %s", database));
	// TODO check what kind of exception is thrown if the database already exists

	// use the created database
	this.hiveContext.sql((String.format("USE %s", database)));
}
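
Regarding the TODO about an already-existing database: Hive SQL accepts an IF NOT EXISTS clause, so a variant of the two statements above avoids the failure case entirely (a sketch using the same fields as the constructor above):

	// create the database only if it is missing, then switch to it
	this.hiveContext.sql(String.format("CREATE DATABASE IF NOT EXISTS %s", database));
	this.hiveContext.sql(String.format("USE %s", database));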
 
Example 10
Project: fst-bench   File: JavaSleep.java
public static void main(String[] args) throws Exception {

    if (args.length != 1) {
      System.err.println("Usage: JavaSleep <seconds>");
      System.exit(1);
    }

    SparkConf sparkConf = new SparkConf().setAppName("JavaSleep");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    Integer parallel = sparkConf.getInt("spark.default.parallelism", ctx.defaultParallelism());
    Integer seconds = Integer.parseInt(args[0]);

    Integer[] init_val = new Integer[parallel];
    Arrays.fill(init_val, seconds);

    JavaRDD<Integer> workload = ctx.parallelize(Arrays.asList(init_val), parallel).map(new Function<Integer, Integer>() {
      @Override
      public Integer call(Integer s) throws InterruptedException {
	    Thread.sleep(s * 1000);
        return 0;
      }
    });

    List<Integer> output = workload.collect();
    ctx.stop();
  }
 
Example 11
Project: SparkToParquet   File: AppMain.java
public static void main(String[] args) throws IOException {
	Flags.setFromCommandLineArgs(THE_OPTIONS, args);

	// Initialize the Spark conf.
	SparkConf conf = new SparkConf().setAppName("A SECTONG Application: Apache Log Analysis with Spark");
	JavaSparkContext sc = new JavaSparkContext(conf);
	JavaStreamingContext jssc = new JavaStreamingContext(sc, Flags.getInstance().getSlideInterval());
	SQLContext sqlContext = new SQLContext(sc);

	// Initialize parameters
	HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(Flags.getInstance().getKafka_topic().split(",")));
	HashMap<String, String> kafkaParams = new HashMap<String, String>();
	kafkaParams.put("metadata.broker.list", Flags.getInstance().getKafka_broker());

	// Get the data from the Kafka stream
	JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(jssc, String.class, String.class,
			StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet);

	JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
		private static final long serialVersionUID = 5266880065425088203L;

		public String call(Tuple2<String, String> tuple2) {
			return tuple2._2();
		}
	});

	JavaDStream<ApacheAccessLog> accessLogsDStream = lines.flatMap(line -> {
		List<ApacheAccessLog> list = new ArrayList<>();
		try {
			// Map each line
			list.add(ApacheAccessLog.parseFromLogLine(line));
			return list;
		} catch (RuntimeException e) {
			return list;
		}
	}).cache();

	accessLogsDStream.foreachRDD(rdd -> {

		// rdd to DataFrame
		DataFrame df = sqlContext.createDataFrame(rdd, ApacheAccessLog.class);
		// Write to Parquet files
		df.write().partitionBy("ipAddress", "method", "responseCode").mode(SaveMode.Append).parquet(Flags.getInstance().getParquetFile());

		return null;
	});

	// Start the streaming server
	jssc.start(); // start the computation
	jssc.awaitTermination(); // wait for termination
}
 
Example 12
Project: arks-api   File: WordCount.java
public static void main(String[] args)
{
 SparkConf conf = new SparkConf();
 conf.setAppName("Wordcount Background");
 conf.setMaster("local");

 JavaStreamingContext ssc = new JavaStreamingContext(conf, Durations.seconds(15));

 JavaDStream<String> lines = ssc.textFileStream("/home/rahul/DATASET");
 JavaDStream<String> words = lines.flatMap(WORDS_EXTRACTOR);
 JavaPairDStream<String, Integer> pairs = words.mapToPair(WORDS_MAPPER);
 JavaPairDStream<String, Integer> counter = pairs.reduceByKey(WORDS_REDUCER);

 counter.print();

 ssc.start();
 ssc.awaitTermination();

 /*JavaRDD<String> file = context.textFile("/home/rahul/Desktop/palestine.txt");
 JavaRDD<String> words = file.flatMap(WORDS_EXTRACTOR);
 JavaPairRDD<String, Integer> pairs = words.mapToPair(WORDS_MAPPER);
 JavaPairRDD<String, Integer> counter = pairs.reduceByKey(WORDS_REDUCER);
 counter.saveAsTextFile("/home/rahul/Desktop/wc");
 context.close();*/
}
 
Example 13
Project: kafka-streams-api-websockets   File: SparkConsume.java
public static void main(String[] args) throws InterruptedException {
    Map<String, Object> kafkaParams = new HashMap<>();
    kafkaParams.put("bootstrap.servers", "localhost:9092");
    kafkaParams.put("key.deserializer", StringDeserializer.class);
    kafkaParams.put("value.deserializer", StringDeserializer.class);
    kafkaParams.put("group.id", "use_a_separate_group_id_for_each_stream");
    kafkaParams.put("auto.offset.reset", "latest");
    kafkaParams.put("enable.auto.commit", false);

    Collection<String> topics = Arrays.asList("data-in");

    SparkConf sparkConf = new SparkConf().setAppName("JavaKafkaSpark");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(5));

    final JavaInputDStream<ConsumerRecord<String, String>> stream =
            KafkaUtils.createDirectStream(
                    streamingContext,
                    LocationStrategies.PreferConsistent(),
                    ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams)
            );

    JavaPairDStream<String, Integer>  countOfMessageKeys = stream
            .map((ConsumerRecord<String, String> record) -> record.key())
            .mapToPair((String s) -> new Tuple2<>(s, 1))
            .reduceByKey((Integer i1, Integer i2)-> i1 + i2);

    countOfMessageKeys.print();

    // Start the computation
    streamingContext.start();
    streamingContext.awaitTermination();
}
 
Example 14
Project: ViraPipe   File: RepartitionFastq.java
public static void main(String[] args) throws IOException {

        if (args.length < 1) {
            System.err.println("Usage: RepartitionFastq <input path> <output path> <number of partitions>");
            System.exit(1);
        }

        SparkConf conf = new SparkConf().setAppName("RepartitionFastq");
        //conf.set("spark.default.parallelism", String.valueOf(args[2]));
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaPairRDD<Text, SequencedFragment> fastqRDD = sc.newAPIHadoopFile(args[0], FastqInputFormat.class, Text.class, SequencedFragment.class, sc.hadoopConfiguration());

        JavaPairRDD<Text, SequencedFragment> repartitioned = fastqRDD.repartition(Integer.valueOf(args[2]));

        repartitioned.saveAsNewAPIHadoopFile(args[1], Text.class, SequencedFragment.class, FastqOutputFormat.class, sc.hadoopConfiguration());

        sc.stop();
    }
 
Example 15
Project: ViraPipe   File: SplitFasta.java
public static void main(String[] args) throws IOException {
    Options options = new Options();
    Option pathOpt = new Option( "in", true, "Path to fastq file in hdfs." );
    Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." );
    options.addOption(  new Option( "partitions", "Divide or merge to n partitions" ) );
    options.addOption( pathOpt );
    options.addOption( opOpt );

    CommandLineParser parser = new BasicParser();
    CommandLine cmd = null;
    try {
        // parse the command line arguments
        cmd = parser.parse( options, args );

    }
    catch( ParseException exp ) {
        // oops, something went wrong
        System.err.println( "Parsing failed.  Reason: " + exp.getMessage() );
    }

    String out = (cmd.hasOption("out")==true)? cmd.getOptionValue("out"):null;
    String in = (cmd.hasOption("in")==true)? cmd.getOptionValue("in"):null;
    String partitions = (cmd.hasOption("partitions")==true)? cmd.getOptionValue("partitions"):null;

    SparkConf conf = new SparkConf().setAppName("SplitFasta");
    JavaSparkContext sc = new JavaSparkContext(conf);
    sc.hadoopConfiguration().set("textinputformat.record.delimiter", ">");

    JavaRDD<String> rdd = sc.textFile(in);
    JavaRDD<String> crdd = rdd.map(v->">"+v.trim()).repartition(Integer.valueOf(partitions));

    crdd.saveAsTextFile(out);
    sc.stop();
}
 
Example 16
Project: incubator-sdap-mudrod   File: SparkDriver.java
public SparkDriver(Properties props) {
  SparkConf conf = new SparkConf().setAppName(props.getProperty(MudrodConstants.SPARK_APP_NAME, "MudrodSparkApp")).setIfMissing("spark.master", props.getProperty(MudrodConstants.SPARK_MASTER))
      .set("spark.hadoop.validateOutputSpecs", "false").set("spark.files.overwrite", "true");

  String esHost = props.getProperty(MudrodConstants.ES_UNICAST_HOSTS);
  String esPort = props.getProperty(MudrodConstants.ES_HTTP_PORT);

  if (!"".equals(esHost)) {
    conf.set("es.nodes", esHost);
  }

  if (!"".equals(esPort)) {
    conf.set("es.port", esPort);
  }

  conf.set("spark.serializer", KryoSerializer.class.getName());
  conf.set("es.batch.size.entries", "1500");

  sc = new JavaSparkContext(conf);
  sqlContext = new SQLContext(sc);
}
 
Example 17
Project: gcp   File: Spark8Organized.java
public static void main(String[] args) throws InterruptedException, IOException, JAXBException {
  SparkConf sc = new SparkConf().setAppName("Receiving-KafkaToBQ");

  try (JavaStreamingContext jsc = new JavaStreamingContext(sc, new Duration(60000))) {

    JavaPairDStream<String, String> stream = new KafkaInputWithOffsets(
        KAFKA_HOST_PORT, EXAMPLE_TOPIC, ZOOKEEPER_HOST, ZK_PATH).createResumableStream(jsc);

    stream.foreachRDD(IdleStop.create(jsc, 2, "XMLs count: %d\n"));

    stream
        .mapToPair(parseXml())
        .filter(t -> t != null)
        .mapToPair(prepToBq())
        .foreachRDD(BigQueryHelper.outputTo(BQ_EXAMPLE_TABLE, BQ_EXAMPLE_SCHEMA));

    jsc.start();
    jsc.awaitTermination();
  }
}
 
Example 18
Project: spark-dependencies   File: ElasticsearchDependenciesJob.java
ElasticsearchDependenciesJob(Builder builder) {
  this.day = builder.day;
  this.conf = new SparkConf(true).setMaster(builder.sparkMaster).setAppName(getClass().getName());
  if (builder.jars != null) {
    conf.setJars(builder.jars);
  }
  if (builder.username != null) {
    conf.set("es.net.http.auth.user", builder.username);
  }
  if (builder.password != null) {
    conf.set("es.net.http.auth.pass", builder.password);
  }
  conf.set("es.nodes", builder.hosts);
  if (builder.hosts.indexOf("https") != -1) {
    conf.set("es.net.ssl", "true");
  }
  for (Map.Entry<String, String> entry : builder.sparkProperties.entrySet()) {
    conf.set(entry.getKey(), entry.getValue());
  }
}
 
Example 19
Project: spark-dependencies   File: CassandraDependenciesJob.java
CassandraDependenciesJob(Builder builder) {
  this.keyspace = builder.keyspace;
  this.day = builder.day;
  SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd");
  df.setTimeZone(TimeZone.getTimeZone("UTC"));
  this.conf = new SparkConf(true)
      .setMaster(builder.sparkMaster)
      .setAppName(getClass().getName());
  conf.set("spark.cassandra.connection.host", parseHosts(builder.contactPoints));
  conf.set("spark.cassandra.connection.port", parsePort(builder.contactPoints));
  if (builder.localDc != null) {
    conf.set("connection.local_dc", builder.localDc);
  }
  if (builder.jars != null) {
    conf.setJars(builder.jars);
  }
  for (Map.Entry<String, String> entry : builder.sparkProperties.entrySet()) {
    conf.set(entry.getKey(), entry.getValue());
  }
}
 
Example 20
Project: rdf2x   File: SparkContextProvider.java
/**
 * Provide a {@link JavaSparkContext} based on default settings
 *
 * @return a {@link JavaSparkContext} based on default settings
 */
public static JavaSparkContext provide() {
    SparkConf config = new SparkConf()
            .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
            .registerKryoClasses(getSerializableClasses());

    if (!config.contains("spark.app.name")) {
        config.setAppName("RDF2X");
    }
    if (!config.contains("spark.master")) {
        config.setMaster("local");
    }

    // set serialization registration required if you want to make sure you registered all your classes
    // some spark internal classes will need to be registered as well
    // config.set("spark.kryo.registrationRequired", "true");


    log.info("Getting Spark Context for config: \n{}", config.toDebugString());
    return new JavaSparkContext(config);
}
 
Example 21
Project: betleopard   File: LiveBetMain.java
private void init() throws IOException {
    final ClientConfig config = new ClientConfig();
    client = HazelcastClient.newHazelcastClient(config);

    final SparkConf conf = new SparkConf()
            .set("hazelcast.server.addresses", "127.0.0.1:5701")
            .set("hazelcast.server.groupName", "dev")
            .set("hazelcast.server.groupPass", "dev-pass")
            .set("hazelcast.spark.valueBatchingEnabled", "true")
            .set("hazelcast.spark.readBatchSize", "5000")
            .set("hazelcast.spark.writeBatchSize", "5000");

    sc = new JavaSparkContext("local", "appname", conf);

    loadHistoricalRaces();
    createRandomUsers();
    createFutureEvent();
}
 
Example 22
Project: Sempala   File: Spark.java
/**
 * Initializes a Spark connection. Use it afterwards for execution of Spark
 * SQL queries.
 * 
 * @param appName the name of the app that will be used with this Spark
 *            connection
 * @param database name of the database that will be used with this Spark
 *            connection
 */
public Spark(String appName, String database) {

	// TODO check what will happen if there is already in use the same app
	// name
	this.sparkConfiguration = new SparkConf().setAppName(appName).set("spark.io.compression.codec", "snappy");
	this.javaContext = new JavaSparkContext(sparkConfiguration);
	this.hiveContext = new HiveContext(javaContext);

	// use the created database
	this.hiveContext.sql((String.format("USE %s", database)));

	configureSparkContext();
	cacheTable();

}
 
Example 23
Project: impala-spark-jdbc-kerberos   File: ImpalaSparkJDBC.java
/**
 * @param args
 * @throws SQLException
 */
public static void main(String[] args) throws SQLException {
    if (args.length == 0) {
        System.out.println("Usage: ImpalaSparkJDBC <url> <tableName>");
        System.out.println("       (secure)   jdbc:impala://impala-host:21050/;AuthMech=1;KrbRealm=realm;KrbHostFQDN=krbHost;KrbServiceName=impala");
        System.out.println("       (insecure) jdbc:hive2://impala-host:21050/;auth=noSasl");
        System.exit(1);
    }

    Properties prop = new Properties();
    prop.setProperty("driver","com.cloudera.impala.jdbc41.Driver");

    System.setProperty("java.security.auth.login.config", "jaas.conf");
    System.setProperty("sun.security.jgss.debug","true");
    System.setProperty("javax.security.auth.useSubjectCredsOnly","false");

    SparkConf sparkConf = new SparkConf().setAppName("ImpalaJDBC");
    SparkContext sc = new SparkContext(sparkConf);
    SQLContext sqlContext = SQLContext.getOrCreate(sc);
    sqlContext.read().jdbc(args[0], args[1], prop).show();
}
 
Example 24
Project: ytk-learn   File: SparkTrainWorker.java
public SparkTrainWorker(
                        SparkConf conf,
                        String modelName,
                        String configPath,
                        String configFile,
                        String pyTransformScript,
                        boolean needPyTransform,
                        String loginName,
                        String hostName,
                        int hostPort,
                        int slaveNum,
                        int threadNum) throws Exception {
    super(modelName, configPath, configFile, pyTransformScript, needPyTransform,
            loginName, hostName, hostPort, threadNum);
    this.slaveNum = slaveNum;

    conf.set("spark.files.fetchTimeout", "3200");
    conf.set("spark.network.timeout", "3200");
    conf.set("spark.dynamicAllocation.executorIdleTimeout", "3200");
    conf.set("spark.dynamicAllocation.schedulerBacklogTimeout", "300");
    conf.set("spark.core.connection.auth.wait.timeout", "3200");
    conf.set("spark.memory.fraction", "0.01");
}
 
Example 25
Project: kinetica-connector-spark   File: SparkKineticaLoader.java
/**
 * Creates SparkConf using properties from properties file
 * @param propertyFile kinetica property file
 */
public static void connectSpark(String propertyFile) {

    try {
        SparkKineticaLoader.loadPropertyFile(propertyFile);
    } catch (IOException e) {
        e.printStackTrace();
        log.error("Unable to load property file", e);
        System.exit(-2);
    }

    SparkConf conf = new SparkConf()
            .setAppName(KineticaConfiguration.PROP_SPARK_APP_NAME);

    sparkSession = SparkSession
            .builder()
            .config(conf)
            .getOrCreate();
}
 
Example 26
Project: fst-bench   File: JavaTeraSort.java
public static void main(String[] args) throws Exception {

    if (args.length < 2) {
      System.err.println("Usage: JavaTeraSort <HDFS_INPUT> <HDFS_OUTPUT>");
      System.exit(1);
    }

    SparkConf sparkConf = new SparkConf().setAppName("JavaTeraSort");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    JavaRDD<String> lines = ctx.textFile(args[0], 1);
    Integer parallel = sparkConf.getInt("spark.default.parallelism", ctx.defaultParallelism());
    Integer reducer  = Integer.parseInt(IOCommon.getProperty("hibench.default.shuffle.parallelism").get());
    JavaPairRDD<String, String> words = lines.mapToPair(new PairFunction<String, String, String>() {
        @Override
        public Tuple2<String, String> call(String s) throws Exception {
            return new Tuple2<String, String>(s.substring(0, 10), s.substring(10));
        }
    });


    JavaPairRDD<String, String> sorted = words.sortByKey(true, reducer);

    JavaRDD<String> result = sorted.map(new Function<Tuple2<String, String>, String>() {
        @Override
        public String call(Tuple2<String, String> e) throws Exception {
            return e._1() + e._2();
        }
    });

    result.saveAsTextFile(args[1]);

    ctx.stop();
  }
 
Example 27
Project: nats-connector-spark   File: SparkToStandardNatsConnectorTest.java
/**
 * @throws java.lang.Exception
 */
@BeforeClass
public static void setUpBeforeClass() throws Exception {
	// Enable tracing for debugging as necessary.
	Level level = Level.WARN;
	UnitTestUtilities.setLogLevel(SparkToNatsConnector.class, level);
	UnitTestUtilities.setLogLevel(SparkToStandardNatsConnectorImpl.class, level);
	UnitTestUtilities.setLogLevel(SparkToStandardNatsConnectorTest.class, level);
	UnitTestUtilities.setLogLevel(TestClient.class, level);
	UnitTestUtilities.setLogLevel("org.apache.spark", level);
	UnitTestUtilities.setLogLevel("org.spark-project", level);
	
	logger = LoggerFactory.getLogger(SparkToStandardNatsConnectorTest.class);       

	SparkConf sparkConf = new SparkConf().setAppName("My Spark Job").setMaster("local[2]").set("spark.driver.host", "localhost"); // https://issues.apache.org/jira/browse/
	sc = new JavaSparkContext(sparkConf);

	UnitTestUtilities.startDefaultServer();
}
 
Example 28
Project: net.jgp.labs.spark   File: StreamingIngestionFileSystemTextFileToDataframeMultipleClassesApp.java
private void start() {
	// Create a local StreamingContext with two working threads and a batch
	// interval of 5 seconds
	SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("Streaming Ingestion File System Text File to Dataframe");
	JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));

	JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils.getInputDirectory());

	msgDataStream.print();
	// Create JavaRDD<Row>
	msgDataStream.foreachRDD(new RowProcessor());	

	jssc.start();
	try {
		jssc.awaitTermination();
	} catch (InterruptedException e) {
		// TODO Auto-generated catch block
		e.printStackTrace();
	}
}
 
Example 29
Project: splice-community-sample-code   File: SparkStreamingMQTT.java
/**
 * This will start the spark stream that is reading from the MQTT queue
 *
 * @param broker     - MQTT broker url
 * @param topic      - MQTT topic name
 * @param numSeconds - Number of seconds between batch size
 */
public void processMQTT(final String broker, final String topic, final int numSeconds) {

    LOG.info("************ SparkStreamingMQTTOutside.processMQTT start");

    // Create the spark application and set the name to MQTT
    SparkConf sparkConf = new SparkConf().setAppName("MQTT");

    // Create the spark streaming context with a 'numSeconds' second batch size
    jssc = new JavaStreamingContext(sparkConf, Durations.seconds(numSeconds));
    jssc.checkpoint(checkpointDirectory);

    LOG.info("************ SparkStreamingMQTTOutside.processMQTT about to read the MQTTUtils.createStream");
    //2. MQTTUtils to collect MQTT messages
    JavaReceiverInputDStream<String> messages = MQTTUtils.createStream(jssc, broker, topic);

    LOG.info("************ SparkStreamingMQTTOutside.processMQTT about to do foreachRDD");
    //process the messages on the queue and save them to the database
    messages.foreachRDD(new SaveRDD());

    LOG.info("************ SparkStreamingMQTTOutside.processMQTT prior to context.strt");
    // Start the context
    jssc.start();
    jssc.awaitTermination();
}
 
Example 30
Project: Sparkathon   File: DAGView.java
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("Big Apple").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);

    JavaRDD<String> lines = sc.textFile("src/main/resources/compressed.gz");
    JavaRDD<String[]> filteredRDD = lines.map(s -> s.split(" ")).filter(words -> words.length > 0);

    System.out.println(filteredRDD.toDebugString());

    JavaPairRDD<String, Integer> stringIntegerJavaPairRDD =
            filteredRDD.mapToPair(words -> new Tuple2<>(words[0], 1));
    JavaPairRDD<String, Integer> stringIntegerJavaPairRDD1 =
            stringIntegerJavaPairRDD.reduceByKey((a, b) -> a + b);

    System.out.println(stringIntegerJavaPairRDD1.toDebugString());

}
 
Example 31
Project: Sparkathon   File: KVTuple.java
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("Big Apple").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);

    JavaRDD<String> lines = sc.textFile("src/main/resources/compressed.gz");
    JavaPairRDD<String, Integer> pairs = lines.mapToPair(s -> new Tuple2(s, 1));
    JavaPairRDD<String, Integer> counts = pairs.reduceByKey((a, b) -> a + b);

    counts.foreach(x-> System.out.println(x));
    JavaRDD<Integer> integerJavaRDD = counts.map(x -> x._2 + 10);
    integerJavaRDD.foreach(x-> System.out.println(x));

    //TODO what does counts.sortByKey do?
    // TODO what would counts.collect do?

}
 
Example 32
Project: Sparkathon   File: PassingFunctions.java
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("Big Apple").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);

    class GetLength implements Function<String, Integer> {
        public Integer call(String s) {
            return s.length();
        }
    }

    class Sum implements Function2<Integer, Integer, Integer> {
        public Integer call(Integer a, Integer b) {
            return a + b;
        }
    }

    JavaRDD<String> lines = sc.textFile("src/main/resources/compressed.gz");
    JavaRDD<Integer> lineLengths = lines.map(new GetLength());
    // Printing an RDD
    lineLengths.foreach(x-> System.out.println(x));

    int totalLength = lineLengths.reduce(new Sum());

    System.out.println(totalLength);
}
 
Example 33
Project: gspark   File: JavaStatusTrackerDemo.java
public static void main(String[] args) throws Exception {
  SparkConf sparkConf = new SparkConf().setAppName(APP_NAME);
  final JavaSparkContext sc = new JavaSparkContext(sparkConf);

  // Example of implementing a progress reporter for a simple job.
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5), 5).map(
      new IdentityWithDelay<Integer>());
  JavaFutureAction<List<Integer>> jobFuture = rdd.collectAsync();
  while (!jobFuture.isDone()) {
    Thread.sleep(1000);  // 1 second
    List<Integer> jobIds = jobFuture.jobIds();
    if (jobIds.isEmpty()) {
      continue;
    }
    int currentJobId = jobIds.get(jobIds.size() - 1);
    SparkJobInfo jobInfo = sc.statusTracker().getJobInfo(currentJobId);
    SparkStageInfo stageInfo = sc.statusTracker().getStageInfo(jobInfo.stageIds()[0]);
    System.out.println(stageInfo.numTasks() + " tasks total: " + stageInfo.numActiveTasks() +
        " active, " + stageInfo.numCompletedTasks() + " complete");
  }

  System.out.println("Job results are: " + jobFuture.get());
  sc.stop();
}
 
Example 34
Project: DataVec   File: BaseSparkTest.java
public synchronized JavaSparkContext getContext() {
    if (sc != null)
        return sc;

    SparkConf sparkConf = new SparkConf().setMaster("local[*]").set("spark.driver.host", "localhost")
                    .set("spark.driverEnv.SPARK_LOCAL_IP", "127.0.0.1")
                    .set("spark.executorEnv.SPARK_LOCAL_IP", "127.0.0.1").setAppName("sparktest");
    if (useKryo()) {
        sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    }


    sc = new JavaSparkContext(sparkConf);

    return sc;
}
 
Example 35
Project: net.jgp.labs.spark   File: StreamingIngestionFileSystemTextFileApp.java
private void start() {
	// Create a local StreamingContext with two working threads and a batch
	// interval of 5 seconds
	SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("NetworkWordCount");
	JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));

	JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils.getInputDirectory());
	msgDataStream.print();

	jssc.start();
	try {
		jssc.awaitTermination();
	} catch (InterruptedException e) {
		// TODO Auto-generated catch block
		e.printStackTrace();
	}
}
 
Example 36
Project: mudrod   File: SparkDriver.java
public SparkDriver(Properties props) {
  SparkConf conf = new SparkConf().setAppName(props.getProperty(MudrodConstants.SPARK_APP_NAME, "MudrodSparkApp")).setIfMissing("spark.master", props.getProperty(MudrodConstants.SPARK_MASTER))
      .set("spark.hadoop.validateOutputSpecs", "false").set("spark.files.overwrite", "true");

  String esHost = props.getProperty(MudrodConstants.ES_UNICAST_HOSTS);
  String esPort = props.getProperty(MudrodConstants.ES_HTTP_PORT);

  if (!"".equals(esHost)) {
    conf.set("es.nodes", esHost);
  }

  if (!"".equals(esPort)) {
    conf.set("es.port", esPort);
  }

  conf.set("spark.serializer", KryoSerializer.class.getName());
  conf.set("es.batch.size.entries", "1500");

  sc = new JavaSparkContext(conf);
  sqlContext = new SQLContext(sc);
}
 
Example 37
Project: AdaptDB   File: SparkQuery.java
public SparkQuery(ConfUtils config) {
	this.cfg = config;
	SparkConf sconf = new SparkConf().setMaster(cfg.getSPARK_MASTER())
			.setAppName(this.getClass().getName())
			.setSparkHome(cfg.getSPARK_HOME())
			.setJars(new String[] { cfg.getSPARK_APPLICATION_JAR() })
			.set("spark.hadoop.cloneConf", "false")
			.set("spark.executor.memory", cfg.getSPARK_EXECUTOR_MEMORY())
			.set("spark.driver.memory", cfg.getSPARK_DRIVER_MEMORY())
			.set("spark.task.cpus", cfg.getSPARK_TASK_CPUS());

	ctx = new JavaSparkContext(sconf);
	ctx.hadoopConfiguration().setBoolean(
			FileInputFormat.INPUT_DIR_RECURSIVE, true);
	ctx.hadoopConfiguration().set("fs.hdfs.impl",
			org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
	queryConf = new SparkQueryConf(ctx.hadoopConfiguration());
}
 
Example 38
Project: LiteGraph   File: SparkGraphComputer.java
/**
 * When using a persistent context the running Context's configuration will override a passed
 * in configuration. Spark allows us to override these inherited properties via
 * SparkContext.setLocalProperty
 */
private void updateLocalConfiguration(final JavaSparkContext sparkContext, final SparkConf sparkConfiguration) {
    /*
     * While we could enumerate over the entire SparkConfiguration and copy into the Thread
     * Local properties of the Spark Context this could cause adverse effects with future
     * versions of Spark. Since the api for setting multiple local properties at once is
     * restricted as private, we will only set those properties we know can affect SparkGraphComputer
     * Execution rather than applying the entire configuration.
     */
    final String[] validPropertyNames = {
            "spark.job.description",
            "spark.jobGroup.id",
            "spark.job.interruptOnCancel",
            "spark.scheduler.pool"
    };

    for (String propertyName : validPropertyNames) {
        if (sparkConfiguration.contains(propertyName)) {
            String propertyValue = sparkConfiguration.get(propertyName);
            this.logger.info("Setting Thread Local SparkContext Property - "
                    + propertyName + " : " + propertyValue);

            sparkContext.setLocalProperty(propertyName, sparkConfiguration.get(propertyName));
        }
    }
}
 
Example 39
Project: arks-api   File: JavaWordCount.java
public void runWordCount(String[] args) {
    if (args.length < 2) {
        System.err
                .println("Please provide the input and output file paths as arguments");
        System.exit(1);
    }

    SparkConf conf = new SparkConf().setAppName(
            "org.sparkexample.WordCount").setMaster("local");
    JavaSparkContext context = new JavaSparkContext(conf);

    JavaRDD<String> file = context.textFile(args[0]);
    JavaRDD<String> words = file.flatMap(WORDS_EXTRACTOR);
    JavaPairRDD<String, Integer> pairs = words.mapToPair(WORDS_MAPPER);
    JavaPairRDD<String, Integer> counter = pairs.reduceByKey(WORDS_REDUCER);

    counter.saveAsTextFile(args[1]);
}
 
Example 40
Project: ParquetUtils   File: ParquetRepartTest.java
@BeforeClass
public static void createContext() throws IOException {

	Configuration hdfsConfig = HDFSUtils.getConfiguration();
	SparkConf config = new SparkConf();
	config.setMaster("local[*]");
	config.setAppName("my JUnit running Spark");
	sc = new JavaSparkContext(config);
	fileSystem = FileSystem.get(hdfsConfig);
	sqlContext = new SQLContext(sc);
	engine = new ParquetRepartEngine(fileSystem, sqlContext);
}