org.apache.spark.api.java.function.VoidFunction Java Examples
The following examples show how to use
org.apache.spark.api.java.function.VoidFunction.
The project and source file that each example comes from are noted above each listing.
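For orientation before the examples: VoidFunction<T> is a single-method interface whose call(T) method returns nothing and is invoked once per element, typically for side effects such as printing or writing out data. The following minimal sketch (the class name, master URL, and sample data are illustrative only) shows both the anonymous-inner-class form used in most examples below and the equivalent Java 8 lambda form.

import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;

public class VoidFunctionSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext("local[2]", "VoidFunctionSketch");
        JavaRDD<String> rdd = sc.parallelize(Arrays.asList("a", "b", "c"));

        // Anonymous inner class: call(T) is invoked once per element for its side effect.
        rdd.foreach(new VoidFunction<String>() {
            @Override
            public void call(String value) throws Exception {
                System.out.println(value);
            }
        });

        // Equivalent Java 8 lambda form.
        rdd.foreach(s -> System.out.println(s));

        sc.close();
    }
}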
Example #1
Source File: Accumulator.java From sparkResearch with Apache License 2.0 | 6 votes |
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local[4]").appName("AttackFind").getOrCreate();
    // Initialize the JavaSparkContext
    JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(sparkSession.sparkContext());
    // Log output level
    javaSparkContext.setLogLevel("ERROR");
    // Create the RDD
    JavaRDD<String> rdd = javaSparkContext.parallelize(Arrays.asList(JavaBean.origin_id, JavaBean.asset_name)).cache();
    AttackAccumulator attackAccumulator = new AttackAccumulator();
    // Register the accumulator
    javaSparkContext.sc().register(attackAccumulator, "attack_count");
    // Generate a random number as the value
    JavaPairRDD<String, String> javaPairRDD = rdd.mapToPair((PairFunction<String, String, String>) s -> {
        Integer random = new Random().nextInt(10);
        return new Tuple2<>(s, s + ":" + random);
    });
    javaPairRDD.foreach((VoidFunction<Tuple2<String, String>>) tuple2 -> {
        attackAccumulator.add(tuple2._2);
    });
    System.out.println(attackAccumulator.value());
}
Example #2
Source File: Intersection.java From SparkDemo with MIT License | 6 votes |
static void intersection(JavaSparkContext sc) {
    List<String> datas1 = Arrays.asList("张三", "李四", "tom");
    List<String> datas2 = Arrays.asList("tom", "gim");

    /**
     * Returns the intersection of the two RDDs.
     */
    JavaRDD<String> intersectionRDD = sc.parallelize(datas1).intersection(sc.parallelize(datas2));

    intersectionRDD.foreach(new VoidFunction<String>() {
        @Override
        public void call(String t) throws Exception {
            System.out.println(t);
        }
    });
}
Example #3
Source File: SampleAndTake.java From SparkDemo with MIT License | 6 votes |
static void sample(JavaSparkContext sc) {
    List<Integer> datas = Arrays.asList(1, 2, 3, 7, 4, 5, 8);

    JavaRDD<Integer> dataRDD = sc.parallelize(datas);

    /**
     * Random sampling. When withReplacement is true, sampled elements are put back and may be
     * drawn multiple times; false means sampling without replacement. fraction is the sampling
     * ratio; seed is the random number seed.
     */
    JavaRDD<Integer> sampleRDD = dataRDD.sample(false, 0.5, System.currentTimeMillis());

    // TODO dataRDD.takeSample(false, 3);
    // TODO dataRDD.take(3)

    sampleRDD.foreach(new VoidFunction<Integer>() {
        @Override
        public void call(Integer t) throws Exception {
            System.out.println(t);
        }
    });

    sc.close();
}
Example #4
Source File: Distinct.java From SparkDemo with MIT License | 6 votes |
private static void distinct(JavaSparkContext sc) {
    List<String> datas = Arrays.asList("张三", "李四", "tom", "张三");

    /**
     * Remove duplicates -- this involves a shuffle operation.
     */
    JavaRDD<String> distinctRDD = sc.parallelize(datas).distinct();

    distinctRDD.foreach(new VoidFunction<String>() {
        @Override
        public void call(String t) throws Exception {
            System.out.println(t);
        }
    });
}
Example #5
Source File: Union.java From SparkDemo with MIT License | 6 votes |
static void union(JavaSparkContext sc) {
    List<String> datas1 = Arrays.asList("张三", "李四");
    List<String> datas2 = Arrays.asList("tom", "gim");

    JavaRDD<String> data1RDD = sc.parallelize(datas1);
    JavaRDD<String> data2RDD = sc.parallelize(datas2);

    /**
     * Merges the two RDDs without removing duplicates; the element types of the two RDDs must match.
     */
    JavaRDD<String> unionRDD = data1RDD.union(data2RDD);

    unionRDD.foreach(new VoidFunction<String>() {
        @Override
        public void call(String t) throws Exception {
            System.out.println(t);
        }
    });

    sc.close();
}
Example #6
Source File: FlatMap.java From SparkDemo with MIT License | 6 votes |
private static void flatMap(JavaSparkContext sc) {
    List<String> data = Arrays.asList("aa,bb,cc", "cxf,spring,struts2", "java,C++,javaScript");
    JavaRDD<String> rddData = sc.parallelize(data);

    FlatMapFunction<String, String> flatMapFunction = new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String s) throws Exception {
            List<String> list = Arrays.asList(s.split(","));
            return list.iterator();
        }
    };
    JavaRDD<String> flatMapData = rddData.flatMap(flatMapFunction);

    flatMapData.foreach(new VoidFunction<String>() {
        @Override
        public void call(String v) throws Exception {
            System.out.println(v);
        }
    });

    sc.close();
}
Example #7
Source File: InputFormatTest.java From HadoopCV with Apache License 2.0 | 6 votes |
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("VideoInput").setMaster("local[2]");
    JavaSparkContext sc = new JavaSparkContext(conf);

    Configuration hc = new org.apache.hadoop.conf.Configuration();
    JavaPairRDD<Text, HBMat> video = sc.newAPIHadoopFile("data/bike.avi",
            VideoInputFormat.class, Text.class, HBMat.class, hc);

    video.foreach(new VoidFunction<Tuple2<Text, HBMat>>() {
        @Override
        public void call(Tuple2<Text, HBMat> tuple) throws Exception {
            HBMat image = (HBMat) tuple._2;
            System.out.print(image.getBmat().dump());
        }
    });

    System.out.print(video.count());
}
Example #8
Source File: Cartesian.java From SparkDemo with MIT License | 6 votes |
private static void cartesian(JavaSparkContext sc) {
    List<String> names = Arrays.asList("张三", "李四", "王五");
    List<Integer> scores = Arrays.asList(60, 70, 80);

    JavaRDD<String> namesRDD = sc.parallelize(names);
    JavaRDD<Integer> scoreRDD = sc.parallelize(scores);

    /**
     * Merges the two RDDs as a Cartesian product.
     */
    JavaPairRDD<String, Integer> cartesianRDD = namesRDD.cartesian(scoreRDD);

    cartesianRDD.foreach(new VoidFunction<Tuple2<String, Integer>>() {
        public void call(Tuple2<String, Integer> t) throws Exception {
            System.out.println(t._1 + "\t" + t._2());
        }
    });
}
Example #9
Source File: Filter.java From SparkDemo with MIT License | 6 votes |
private static void filter(JavaSparkContext sc) {
    List<Integer> datas = Arrays.asList(1, 2, 3, 7, 4, 5, 8);

    JavaRDD<Integer> rddData = sc.parallelize(datas);
    JavaRDD<Integer> filterRDD = rddData.filter(
            // jdk1.8
            // v1 -> v1 >= 3
            new Function<Integer, Boolean>() {
                public Boolean call(Integer v) throws Exception {
                    // Keep only numbers greater than or equal to 4
                    return v >= 4;
                }
            });

    filterRDD.foreach(
            // jdk1.8
            // v -> System.out.println(v)
            new VoidFunction<Integer>() {
                @Override
                public void call(Integer integer) throws Exception {
                    System.out.println(integer);
                }
            });

    sc.close();
}
Example #10
Source File: BroadCastParam.java From sparkResearch with Apache License 2.0 | 6 votes |
/**
 * Broadcast variable test.
 * @param args
 */
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local[4]").appName("AttackFind").getOrCreate();
    // Initialize the JavaSparkContext
    JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(sparkSession.sparkContext());
    // Assume this list is the data to broadcast
    // (as noted earlier, broadcast variables are read-only)
    final List<String> broadcastList = Arrays.asList("190099HJLL", "98392QUEYY", "561788LLKK");
    // Register the broadcast variable and send it to the executors
    final Broadcast<List<String>> broadcast = javaSparkContext.broadcast(broadcastList);
    // Define the data
    JavaPairRDD<String, String> pairRDD = javaSparkContext.parallelizePairs(Arrays.asList(new Tuple2<>("000", "000")));
    JavaPairRDD<String, String> resultPairRDD = pairRDD.filter((Function<Tuple2<String, String>, Boolean>) v1 -> broadcast.value().contains(v1._2));
    resultPairRDD.foreach((VoidFunction<Tuple2<String, String>>) System.out::println);
}
Example #11
Source File: ProcessedOffsetManager.java From kafka-spark-consumer with Apache License 2.0 | 5 votes |
@SuppressWarnings("deprecation")
public static void persists(JavaPairDStream<String, Iterable<Long>> partitonOffset, Properties props) {
    partitonOffset.foreachRDD(new VoidFunction<JavaPairRDD<String, Iterable<Long>>>() {
        @Override
        public void call(JavaPairRDD<String, Iterable<Long>> po) throws Exception {
            List<Tuple2<String, Iterable<Long>>> poList = po.collect();
            doPersists(poList, props);
        }
    });
}
Example #12
Source File: ProcessedOffsetManager.java From kafka-spark-consumer with Apache License 2.0 | 5 votes |
@SuppressWarnings("deprecation")
public static void persists(DStream<Tuple2<String, Iterable<Long>>> partitonOffset, Properties props) {
    ClassTag<Tuple2<String, Iterable<Long>>> tuple2ClassTag =
            ScalaUtil.<String, Iterable<Long>>getTuple2ClassTag();
    JavaDStream<Tuple2<String, Iterable<Long>>> jpartitonOffset =
            new JavaDStream<Tuple2<String, Iterable<Long>>>(partitonOffset, tuple2ClassTag);
    jpartitonOffset.foreachRDD(new VoidFunction<JavaRDD<Tuple2<String, Iterable<Long>>>>() {
        @Override
        public void call(JavaRDD<Tuple2<String, Iterable<Long>>> po) throws Exception {
            List<Tuple2<String, Iterable<Long>>> poList = po.collect();
            doPersists(poList, props);
        }
    });
}
Example #13
Source File: ReaderWriterExample.java From spliceengine with GNU Affero General Public License v3.0 | 5 votes |
public static void main(String[] args) throws Exception {
    final String dbUrl = args[0];
    final String hostname = args[1];
    final String port = args[2];
    final String inTargetSchema = args[3];
    final String inTargetTable = args[4];

    SparkConf conf = new SparkConf();

    JavaStreamingContext ssc = new JavaStreamingContext(conf, new Duration(500));
    JavaReceiverInputDStream<String> stream = ssc.socketTextStream(hostname, Integer.parseInt(port));

    SparkSession spark = SparkSession.builder().getOrCreate();

    // Create a SplicemachineContext based on the provided DB connection
    SplicemachineContext splicemachineContext = new SplicemachineContext(dbUrl);

    // Set target tablename and schemaname
    final String table = inTargetSchema + "." + inTargetTable;

    stream.foreachRDD((VoidFunction<JavaRDD<String>>) rdd -> {
        JavaRDD<Row> rowRDD = rdd.map((Function<String, Row>) s -> RowFactory.create(s));
        Dataset<Row> df = spark.createDataFrame(rowRDD, splicemachineContext.getSchema(table));
        splicemachineContext.insert(df, table);
    });

    ssc.start();
    ssc.awaitTermination();
}
Example #14
Source File: ReaderWriterExample.java From spliceengine with GNU Affero General Public License v3.0 | 5 votes |
public static void main(String[] args) throws Exception {
    final String dbUrl = args[0];
    final String hostname = args[1];
    final String port = args[2];
    final String inTargetSchema = args[3];
    final String inTargetTable = args[4];

    SparkConf conf = new SparkConf();

    JavaStreamingContext ssc = new JavaStreamingContext(conf, new Duration(500));
    SpliceSpark.setContext(ssc.sparkContext());
    SparkSession spark = SpliceSpark.getSessionUnsafe();
    JavaReceiverInputDStream<String> stream = ssc.socketTextStream(hostname, Integer.parseInt(port));

    // Create a SplicemachineContext based on the provided DB connection
    SplicemachineContext splicemachineContext = new SplicemachineContext(dbUrl);

    // Set target tablename and schemaname
    final String table = inTargetSchema + "." + inTargetTable;

    stream.foreachRDD((VoidFunction<JavaRDD<String>>) rdd -> {
        JavaRDD<Row> rowRDD = rdd.map((Function<String, Row>) s -> RowFactory.create(s));
        Dataset<Row> df = spark.createDataFrame(rowRDD, splicemachineContext.getSchema(table));
        splicemachineContext.insert(df, table);
    });

    ssc.start();
    ssc.awaitTermination();
}
Example #15
Source File: ComputeStreamingResponse.java From incubator-retired-pirk with Apache License 2.0 | 5 votes |
private void encryptedColumnCalc(JavaPairDStream<Long, BigInteger> encRowRDD) {
    // Multiply the column values by colNum: emit <colNum, finalColVal>
    JavaPairDStream<Long, BigInteger> encColRDD;
    if (colMultReduceByKey) {
        encColRDD = encRowRDD.reduceByKey(new EncColMultReducer(bVars), numColMultPartitions);
    } else {
        encColRDD = encRowRDD.groupByKey(numColMultPartitions).mapToPair(new EncColMultGroupedMapper(bVars));
    }

    // Update the output name, by batch number
    bVars.setOutput(outputFile + "_" + accum.numBatchesGetValue());

    // Form and write the response object
    encColRDD.repartition(1).foreachRDD((VoidFunction<JavaPairRDD<Long, BigInteger>>) rdd -> {
        rdd.foreachPartition(new FinalResponseFunction(accum, bVars));

        int maxBatchesVar = bVars.getMaxBatches();
        if (maxBatchesVar != -1 && accum.numBatchesGetValue() == maxBatchesVar) {
            logger.info("num batches = maxBatches = " + maxBatchesVar + "; shutting down");
            System.exit(0);
        }
    });
}
Example #16
Source File: Functions.java From spark-streaming-direct-kafka with Apache License 2.0 | 5 votes |
public static <T> VoidFunction<T> noOp() {
    return new VoidFunction<T>() {
        @Override
        public void call(T t) {
            // do nothing
        }
    };
}
Example #17
Source File: AccumulatorValue.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) {
    JavaSparkContext sc = SparkUtils.getLocalSparkContext(AccumulatorValue.class);

    // Create the accumulator
    final Accumulator<Integer> accumulator = sc.accumulator(0, "My Accumulator");

    List<Integer> list = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
    JavaRDD<Integer> listRDD = sc.parallelize(list);

    listRDD.foreach(new VoidFunction<Integer>() {
        @Override
        public void call(Integer n) throws Exception {
            accumulator.add(n);
            // The value cannot be read on an executor; doing so throws an exception
            // System.out.println(accumulator.value());
        }
    });

    // The value can only be read on the driver
    System.out.println(accumulator.value());

    try {
        // Keep the application alive so the web UI stays reachable: http://192.168.68.1:4040
        Thread.sleep(5000L * 5000L * 5000L); // long arithmetic avoids int overflow
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    sc.close();
}
Example #18
Source File: JDBCDataSource.java From SparkDemo with MIT License | 5 votes |
/**
 * @category Save the RDD data into a MySQL database
 * @param sqlContext
 * @param options
 */
private static void saveToMysql(SQLContext sqlContext, Map<String, String> options) {
    options.put("url", "jdbc:mysql://192.168.68.1:3306/tourismdb");
    options.put("dbtable", "t_user");

    Dataset<Row> dataset = sqlContext.read().format("jdbc").options(options).load();

    dataset.javaRDD().foreach(new VoidFunction<Row>() {
        @Override
        public void call(Row row) throws Exception {
            String sql = "insert into t_user(name, password, phone, email, type, status, del) values("
                    + "'" + row.getString(1) + "',"
                    + "'" + row.getString(2) + "',"
                    + "'" + row.getString(3) + "',"
                    + "'" + row.getString(4) + "',"
                    + row.getInt(5) + ","
                    + row.getInt(6) + ","
                    + row.getInt(7) + ")";
            System.out.println(sql);

            Class.forName("com.mysql.jdbc.Driver");
            Connection conn = null;
            Statement statement = null;
            try {
                conn = DriverManager.getConnection("jdbc:mysql://192.168.68.129:3306/sparkdemo", "root", "666666");
                statement = conn.createStatement();
                statement.executeUpdate(sql);
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                if (statement != null) {
                    statement.close();
                }
                if (conn != null) {
                    conn.close();
                }
            }
        }
    });
}
Example #19
Source File: JDBCDataSource.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) {
    // SparkConf conf = new SparkConf().setAppName("JDBCDataSource").setMaster("local");
    JavaSparkContext sc = SparkUtils.getRemoteSparkContext(JDBCDataSource.class);
    SQLContext sqlContext = new SQLContext(sc);

    Map<String, String> options = new HashMap<String, String>();
    options.put("url", "jdbc:mysql://192.168.2.129:3306/hive");
    options.put("dbtable", "t_user");
    options.put("user", "root");
    options.put("password", "666666");

    // Load the JDBC configuration; this does not connect to the database immediately
    Dataset<Row> dataset1 = sqlContext.read().format("jdbc").options(options).load();

    // options.put("dbtable", "tb_item");
    // DataFrame dataFrame2 = sqlContext.read().format("jdbc").options(options).load();

    // Read the JDBC table data
    dataset1.javaRDD().foreach(new VoidFunction<Row>() {
        @Override
        public void call(Row row) throws Exception {
            System.out.println(row);
        }
    });

    // Save the RDD data to MySQL
    saveToMysql(sqlContext, options);

    sc.close();
}
Example #20
Source File: Cogroup.java From SparkDemo with MIT License | 5 votes |
private static void cogroup(JavaSparkContext sc) {
    List<Tuple2<Integer, String>> datas1 = new ArrayList<>();
    datas1.add(new Tuple2<>(1, "苹果"));
    datas1.add(new Tuple2<>(2, "梨"));
    datas1.add(new Tuple2<>(3, "香蕉"));
    datas1.add(new Tuple2<>(4, "石榴"));

    List<Tuple2<Integer, Integer>> datas2 = new ArrayList<>();
    datas2.add(new Tuple2<>(1, 7));
    datas2.add(new Tuple2<>(2, 3));
    datas2.add(new Tuple2<>(3, 8));
    datas2.add(new Tuple2<>(4, 3));

    List<Tuple2<Integer, String>> datas3 = new ArrayList<>();
    datas3.add(new Tuple2<>(1, "7"));
    datas3.add(new Tuple2<>(2, "3"));
    datas3.add(new Tuple2<>(3, "8"));
    datas3.add(new Tuple2<>(4, "3"));
    datas3.add(new Tuple2<>(4, "4"));
    datas3.add(new Tuple2<>(4, "5"));
    datas3.add(new Tuple2<>(4, "6"));

    /**
     * cogroup: for each key, groups the values from each RDD into a separate collection.
     * Unlike reduceByKey, values with the same key are not merged across the RDDs.
     */
    sc.parallelizePairs(datas1).cogroup(sc.parallelizePairs(datas2), sc.parallelizePairs(datas3)).foreach(
            new VoidFunction<Tuple2<Integer, Tuple3<Iterable<String>, Iterable<Integer>, Iterable<String>>>>() {
                @Override
                public void call(Tuple2<Integer, Tuple3<Iterable<String>, Iterable<Integer>, Iterable<String>>> t) throws Exception {
                    System.out.println(t._1 + "==" + t._2);
                }
            });
}
Example #21
Source File: Join.java From SparkDemo with MIT License | 5 votes |
static void join(JavaSparkContext sc) {
    List<Tuple2<Integer, String>> products = new ArrayList<>();
    products.add(new Tuple2<>(1, "苹果"));
    products.add(new Tuple2<>(2, "梨"));
    products.add(new Tuple2<>(3, "香蕉"));
    products.add(new Tuple2<>(4, "石榴"));

    List<Tuple2<Integer, Integer>> counts = new ArrayList<>();
    counts.add(new Tuple2<>(1, 7));
    counts.add(new Tuple2<>(2, 3));
    counts.add(new Tuple2<>(3, 8));
    counts.add(new Tuple2<>(4, 3));
    counts.add(new Tuple2<>(5, 9));

    JavaPairRDD<Integer, String> productsRDD = sc.parallelizePairs(products);
    JavaPairRDD<Integer, Integer> countsRDD = sc.parallelizePairs(counts);

    /**
     * Joins <K, V> with <K, W> and returns (K, (V, W)).
     * The outer-join variants are leftOuterJoin, rightOuterJoin, and fullOuterJoin.
     */
    productsRDD.join(countsRDD)
            .foreach(new VoidFunction<Tuple2<Integer, Tuple2<String, Integer>>>() {
                @Override
                public void call(Tuple2<Integer, Tuple2<String, Integer>> t) throws Exception {
                    System.out.println(t._1 + "\t" + t._2());
                }
            });
}
Example #22
Source File: Runner.java From envelope with Apache License 2.0 | 4 votes |
/**
 * Run the Envelope pipeline as a Spark Streaming job.
 * @param steps The full configuration of the Envelope pipeline
 */
@SuppressWarnings("unchecked")
private void runStreaming(final Set<Step> steps) throws Exception {
    final Set<Step> independentNonStreamingSteps = StepUtils.getIndependentNonStreamingSteps(steps);
    runBatch(independentNonStreamingSteps);

    Set<StreamingStep> streamingSteps = StepUtils.getStreamingSteps(steps);
    for (final StreamingStep streamingStep : streamingSteps) {
        LOG.debug("Setting up streaming step: " + streamingStep.getName());

        JavaDStream stream = streamingStep.getStream();

        stream.foreachRDD(new VoidFunction<JavaRDD<?>>() {
            @Override
            public void call(JavaRDD<?> raw) throws Exception {
                // Some independent steps might be repeating steps that have been flagged for reload
                StepUtils.resetRepeatingSteps(steps);
                // This will run any batch steps (and dependents) that are not submitted
                runBatch(independentNonStreamingSteps);

                streamingStep.setData(streamingStep.translate(raw));
                streamingStep.writeData();
                streamingStep.setState(StepState.FINISHED);

                Set<Step> batchSteps = StepUtils.mergeLoadedSteps(steps, streamingStep, baseConfig);
                Set<Step> dependentSteps = StepUtils.getAllDependentSteps(streamingStep, batchSteps);
                batchSteps.add(streamingStep);
                batchSteps.addAll(streamingStep.loadNewBatchSteps());
                batchSteps.addAll(independentNonStreamingSteps);
                runBatch(batchSteps);

                StepUtils.resetSteps(dependentSteps);

                streamingStep.recordProgress(raw);
            }
        });

        LOG.debug("Finished setting up streaming step: " + streamingStep.getName());
    }

    JavaStreamingContext jsc = Contexts.getJavaStreamingContext();
    jsc.start();
    LOG.debug("Streaming context started");
    jsc.awaitTermination();
    LOG.debug("Streaming context terminated");
}
Example #23
Source File: TranslationUtils.java From beam with Apache License 2.0 | 4 votes |
public static <T> VoidFunction<T> emptyVoidFunction() {
    return t -> {
        // Empty implementation.
    };
}
Example #24
Source File: StreamingIngestionFileSystemTextFileToDataframeApp.java From net.jgp.labs.spark with Apache License 2.0 | 4 votes |
private void start() {
    // Create a local StreamingContext with two working threads and a batch interval of 5 seconds
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName(
            "Streaming Ingestion File System Text File to Dataframe");
    JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));

    JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils.getInputDirectory());
    msgDataStream.print();

    // Create JavaRDD<Row>
    msgDataStream.foreachRDD(new VoidFunction<JavaRDD<String>>() {
        private static final long serialVersionUID = -590010339928376829L;

        @Override
        public void call(JavaRDD<String> rdd) {
            JavaRDD<Row> rowRDD = rdd.map(new Function<String, Row>() {
                private static final long serialVersionUID = 5167089361335095997L;

                @Override
                public Row call(String msg) {
                    Row row = RowFactory.create(msg);
                    return row;
                }
            });
            // Create Schema
            StructType schema = DataTypes.createStructType(
                    new StructField[] { DataTypes.createStructField("Message", DataTypes.StringType, true) });

            // Get Spark 2.0 session
            SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
            Dataset<Row> msgDataFrame = spark.createDataFrame(rowRDD, schema);
            msgDataFrame.show();
        }
    });

    jssc.start();
    try {
        jssc.awaitTermination();
    } catch (InterruptedException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}
Example #25
Source File: ReduceByKey.java From SparkDemo with MIT License | 4 votes |
/**
 * @category Count the words in a text file
 * @param sc
 */
private static void reduceByKey(JavaSparkContext sc) {
    JavaRDD<String> lines = sc.textFile(Constant.LOCAL_FILE_PREX + "README.md");

    /**
     * Split each line on ' ' and flatten the output.
     */
    JavaRDD<String> wordsRDD = lines.flatMap(new FlatMapFunction<String, String>() {
        private static final long serialVersionUID = 1L;

        public Iterator<String> call(String line) throws Exception {
            List<String> words = Arrays.asList(line.split(" "));
            return words.iterator();
        }
    });

    /**
     * Convert each word into a tuple of the form (word, 1).
     */
    JavaPairRDD<String, Integer> wordsCount = wordsRDD.mapToPair(new PairFunction<String, String, Integer>() {
        private static final long serialVersionUID = 1L;

        public Tuple2<String, Integer> call(String word) throws Exception {
            return new Tuple2<String, Integer>(word, 1);
        }
    });

    /**
     * Group by the tuple key (that is, the word) and sum the counts.
     */
    JavaPairRDD<String, Integer> resultRDD = wordsCount.reduceByKey(new Function2<Integer, Integer, Integer>() {
        private static final long serialVersionUID = 1L;

        public Integer call(Integer v1, Integer v2) throws Exception {
            return v1 + v2;
        }
    });

    resultRDD.foreach(new VoidFunction<Tuple2<String, Integer>>() {
        private static final long serialVersionUID = 1L;

        public void call(Tuple2<String, Integer> t) throws Exception {
            System.out.println(t._1 + "\t" + t._2());
        }
    });

    sc.close();
}
Example #26
Source File: JavaHBaseMapGetPutExample.java From learning-hadoop with Apache License 2.0 | 4 votes |
public static void main(String args[]) {
    if (args.length == 0) {
        System.out.println("JavaHBaseBulkGetExample {master} {tableName}");
    }

    String master = args[0];
    String tableName = args[1];

    JavaSparkContext jsc = new JavaSparkContext(master, "JavaHBaseBulkGetExample");
    jsc.addJar("SparkHBase.jar");

    List<byte[]> list = new ArrayList<byte[]>();
    list.add(Bytes.toBytes("1"));
    list.add(Bytes.toBytes("2"));
    list.add(Bytes.toBytes("3"));
    list.add(Bytes.toBytes("4"));
    list.add(Bytes.toBytes("5"));

    // All Spark
    JavaRDD<byte[]> rdd = jsc.parallelize(list);

    // All HBase
    Configuration conf = HBaseConfiguration.create();
    conf.addResource(new Path("/etc/hbase/conf/core-site.xml"));
    conf.addResource(new Path("/etc/hbase/conf/hbase-site.xml"));

    // This is me
    JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf);

    // This is me
    hbaseContext.foreachPartition(rdd, null);
    hbaseContext.foreach(rdd, new VoidFunction<Tuple2<byte[], HConnection>>() {
        public void call(Tuple2<byte[], HConnection> t) throws Exception {
            HTableInterface table1 = t._2.getTable(Bytes.toBytes("Foo"));

            byte[] b = t._1;
            Result r = table1.get(new Get(b));
            if (r.getExists()) {
                table1.put(new Put(b));
            }
        }
    });
}
Example #27
Source File: SparkStreamingFromFlumeToHBaseWindowingExample.java From SparkOnALog with Apache License 2.0 | 4 votes |
public static void main(String[] args) {
    if (args.length == 0) {
        System.err.println("Usage: SparkStreamingFromFlumeToHBaseWindowingExample "
                + "{master} {host} {port} {table} {columnFamily} {windowInSeconds} {slideInSeconds}");
        System.exit(1);
    }

    String master = args[0];
    String host = args[1];
    int port = Integer.parseInt(args[2]);
    String tableName = args[3];
    String columnFamily = args[4];
    int windowInSeconds = Integer.parseInt(args[5]);
    int slideInSeconds = Integer.parseInt(args[6]); // the original read args[5] here, which ignored the slide argument

    Duration batchInterval = new Duration(2000);
    Duration windowInterval = new Duration(windowInSeconds * 1000);
    Duration slideInterval = new Duration(slideInSeconds * 1000);

    JavaStreamingContext sc = new JavaStreamingContext(master, "FlumeEventCount", batchInterval,
            System.getenv("SPARK_HOME"), "/home/cloudera/SparkOnALog.jar");

    final Broadcast<String> broadcastTableName = sc.sparkContext().broadcast(tableName);
    final Broadcast<String> broadcastColumnFamily = sc.sparkContext().broadcast(columnFamily);

    // JavaDStream<SparkFlumeEvent> flumeStream = sc.flumeStream(host, port);
    JavaDStream<SparkFlumeEvent> flumeStream = FlumeUtils.createStream(sc, host, port);

    JavaPairDStream<String, Integer> lastCounts = flumeStream
            .flatMap(new FlatMapFunction<SparkFlumeEvent, String>() {
                @Override
                public Iterable<String> call(SparkFlumeEvent event) throws Exception {
                    String bodyString = new String(event.event().getBody().array(), "UTF-8");
                    return Arrays.asList(bodyString.split(" "));
                }
            }).map(new PairFunction<String, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(String str) throws Exception {
                    return new Tuple2(str, 1);
                }
            }).reduceByKeyAndWindow(new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(Integer x, Integer y) throws Exception {
                    return x.intValue() + y.intValue();
                }
            }, windowInterval, slideInterval);

    lastCounts.foreach(new Function2<JavaPairRDD<String, Integer>, Time, Void>() {
        @Override
        public Void call(JavaPairRDD<String, Integer> values, Time time) throws Exception {
            values.foreach(new VoidFunction<Tuple2<String, Integer>>() {
                @Override
                public void call(Tuple2<String, Integer> tuple) throws Exception {
                    HBaseCounterIncrementor incrementor = HBaseCounterIncrementor
                            .getInstance(broadcastTableName.value(), broadcastColumnFamily.value());
                    incrementor.incerment("Counter", tuple._1(), tuple._2());
                    System.out.println("Counter:" + tuple._1() + "," + tuple._2());
                }
            });
            return null;
        }
    });

    sc.start();
}
Example #28
Source File: SparkStreamingFromFlumeToHBaseExample.java From SparkOnALog with Apache License 2.0 | 4 votes |
public static void main(String[] args) {
    if (args.length == 0) {
        System.err.println("Usage: SparkStreamingFromFlumeToHBaseExample "
                + "{master} {host} {port} {table} {columnFamily}");
        System.exit(1);
    }

    String master = args[0];
    String host = args[1];
    int port = Integer.parseInt(args[2]);
    String tableName = args[3];
    String columnFamily = args[4];

    Duration batchInterval = new Duration(2000);

    JavaStreamingContext sc = new JavaStreamingContext(master, "FlumeEventCount", batchInterval,
            System.getenv("SPARK_HOME"), "/home/cloudera/SparkOnALog.jar");

    final Broadcast<String> broadcastTableName = sc.sparkContext().broadcast(tableName);
    final Broadcast<String> broadcastColumnFamily = sc.sparkContext().broadcast(columnFamily);

    // JavaDStream<SparkFlumeEvent> flumeStream = sc.flumeStream(host, port);
    JavaDStream<SparkFlumeEvent> flumeStream = FlumeUtils.createStream(sc, host, port);

    JavaPairDStream<String, Integer> lastCounts = flumeStream
            .flatMap(new FlatMapFunction<SparkFlumeEvent, String>() {
                @Override
                public Iterable<String> call(SparkFlumeEvent event) throws Exception {
                    String bodyString = new String(event.event().getBody().array(), "UTF-8");
                    return Arrays.asList(bodyString.split(" "));
                }
            }).map(new PairFunction<String, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(String str) throws Exception {
                    return new Tuple2(str, 1);
                }
            }).reduceByKey(new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(Integer x, Integer y) throws Exception {
                    return x.intValue() + y.intValue();
                }
            });

    lastCounts.foreach(new Function2<JavaPairRDD<String, Integer>, Time, Void>() {
        @Override
        public Void call(JavaPairRDD<String, Integer> values, Time time) throws Exception {
            values.foreach(new VoidFunction<Tuple2<String, Integer>>() {
                @Override
                public void call(Tuple2<String, Integer> tuple) throws Exception {
                    HBaseCounterIncrementor incrementor = HBaseCounterIncrementor
                            .getInstance(broadcastTableName.value(), broadcastColumnFamily.value());
                    incrementor.incerment("Counter", tuple._1(), tuple._2());
                    System.out.println("Counter:" + tuple._1() + "," + tuple._2());
                }
            });
            return null;
        }
    });

    sc.start();
}
Example #29
Source File: JavaStreamingTestExample.java From SparkDemo with MIT License | 4 votes |
public static void main(String[] args) throws Exception {
    if (args.length != 3) {
        System.err.println("Usage: JavaStreamingTestExample " +
                "<dataDir> <batchDuration> <numBatchesTimeout>");
        System.exit(1);
    }

    String dataDir = args[0];
    Duration batchDuration = Seconds.apply(Long.parseLong(args[1]));
    int numBatchesTimeout = Integer.parseInt(args[2]);

    SparkConf conf = new SparkConf().setMaster("local").setAppName("StreamingTestExample");
    JavaStreamingContext ssc = new JavaStreamingContext(conf, batchDuration);

    ssc.checkpoint(Utils.createTempDir(System.getProperty("java.io.tmpdir"), "spark").toString());

    // $example on$
    JavaDStream<BinarySample> data = ssc.textFileStream(dataDir).map(
            new Function<String, BinarySample>() {
                @Override
                public BinarySample call(String line) {
                    String[] ts = line.split(",");
                    boolean label = Boolean.parseBoolean(ts[0]);
                    double value = Double.parseDouble(ts[1]);
                    return new BinarySample(label, value);
                }
            });

    StreamingTest streamingTest = new StreamingTest()
            .setPeacePeriod(0)
            .setWindowSize(0)
            .setTestMethod("welch");

    JavaDStream<StreamingTestResult> out = streamingTest.registerStream(data);
    out.print();
    // $example off$

    // Stop processing if test becomes significant or we time out
    timeoutCounter = numBatchesTimeout;

    out.foreachRDD(new VoidFunction<JavaRDD<StreamingTestResult>>() {
        @Override
        public void call(JavaRDD<StreamingTestResult> rdd) {
            timeoutCounter -= 1;

            boolean anySignificant = !rdd.filter(new Function<StreamingTestResult, Boolean>() {
                @Override
                public Boolean call(StreamingTestResult v) {
                    return v.pValue() < 0.05;
                }
            }).isEmpty();

            if (timeoutCounter <= 0 || anySignificant) {
                rdd.context().stop();
            }
        }
    });

    ssc.start();
    ssc.awaitTermination();
}
Example #30
Source File: SampleConsumer.java From kafka-spark-consumer with Apache License 2.0 | 4 votes |
@SuppressWarnings("deprecation")
private void run() {
    Properties props = new Properties();
    props.put("zookeeper.hosts", "zkhost");
    props.put("zookeeper.port", "2181");
    props.put("kafka.topic", "topicA,topicB,topicC");
    props.put("kafka.consumer.id", "kafka-consumer");
    // Optional Properties
    props.put("zookeeper.broker.path", "/brokers");
    props.put("zookeeper.consumer.path", "/consumers");
    props.put("consumer.forcefromstart", "false");
    props.put("max.poll.records", "10");
    props.put("consumer.fillfreqms", "500");
    props.put("consumer.backpressure.enabled", "true");
    // Kafka properties
    props.put("bootstrap.servers", "kafkahost-1:6667,"
            + "kafkahost-2:6667,"
            + "kafkahost-3:6667,"
            + "kafkahost-4:6667");
    props.put("security.protocol", "SSL");
    props.put("ssl.truststore.location", "~/kafka-securitykafka.server.truststore.jks");
    props.put("ssl.truststore.password", "test1234");

    SparkConf _sparkConf = new SparkConf();
    JavaStreamingContext jsc = new JavaStreamingContext(_sparkConf, Durations.seconds(30));
    // Specify number of Receivers you need.
    int numberOfReceivers = 6;

    JavaDStream<MessageAndMetadata<byte[]>> unionStreams = ReceiverLauncher.launch(
            jsc, props, numberOfReceivers, StorageLevel.MEMORY_ONLY());

    unionStreams.foreachRDD(new VoidFunction<JavaRDD<MessageAndMetadata<byte[]>>>() {
        @Override
        public void call(JavaRDD<MessageAndMetadata<byte[]>> rdd) throws Exception {
            // Start Application Logic
            rdd.foreachPartition(new VoidFunction<Iterator<MessageAndMetadata<byte[]>>>() {
                @Override
                public void call(Iterator<MessageAndMetadata<byte[]>> mmItr) throws Exception {
                    int countTopicA = 0;
                    int countTopicB = 0;
                    int countTopicC = 0;
                    while (mmItr.hasNext()) {
                        MessageAndMetadata<byte[]> mm = mmItr.next();
                        if (mm.getTopic().equals("topicA")) {
                            countTopicA++;
                        } else if (mm.getTopic().equals("topicB")) {
                            countTopicB++;
                        } else if (mm.getTopic().equals("topicC")) {
                            countTopicC++;
                        }
                    }
                    System.out.println("topicA count " + countTopicA);
                    System.out.println("topicB count " + countTopicB);
                    System.out.println("topicC count " + countTopicC);
                }
            });
            System.out.println("RDD count " + rdd.count());
            // End Application Logic
            // commit offset
            System.out.println("Commiting Offset");
            ProcessedOffsetManager.persistsPartition(rdd, props);
        }
    });

    try {
        jsc.start();
        jsc.awaitTermination();
    } catch (Exception ex) {
        jsc.ssc().sc().cancelAllJobs();
        jsc.stop(true, false);
        System.exit(-1);
    }
}