org.apache.spark.api.java.function.VoidFunction Java Examples
The following examples show how to use
org.apache.spark.api.java.function.VoidFunction.
The project and source file that each example comes from are noted above each listing.
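For orientation before the examples: VoidFunction<T> is a single-method interface whose call(T) method returns nothing and is invoked once per element, typically for side effects such as printing or writing out data. The following minimal sketch (the class name, master URL, and sample data are illustrative only) shows both the anonymous-inner-class form used in most examples below and the equivalent Java 8 lambda form.

import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;

public class VoidFunctionSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext("local[2]", "VoidFunctionSketch");
        JavaRDD<String> rdd = sc.parallelize(Arrays.asList("a", "b", "c"));

        // Anonymous inner class: call(T) is invoked once per element for its side effect.
        rdd.foreach(new VoidFunction<String>() {
            @Override
            public void call(String value) throws Exception {
                System.out.println(value);
            }
        });

        // Equivalent Java 8 lambda form.
        rdd.foreach(s -> System.out.println(s));

        sc.close();
    }
}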
Example #1
Source File: Accumulator.java From sparkResearch with Apache License 2.0 | 6 votes |
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local[4]").appName("AttackFind").getOrCreate();
    // Initialize the JavaSparkContext
    JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(sparkSession.sparkContext());
    // Log output level
    javaSparkContext.setLogLevel("ERROR");
    // Create the RDD
    JavaRDD<String> rdd = javaSparkContext.parallelize(Arrays.asList(JavaBean.origin_id, JavaBean.asset_name)).cache();
    AttackAccumulator attackAccumulator = new AttackAccumulator();
    // Register the accumulator
    javaSparkContext.sc().register(attackAccumulator, "attack_count");
    // Generate a random number as the value
    JavaPairRDD<String, String> javaPairRDD = rdd.mapToPair((PairFunction<String, String, String>) s -> {
        Integer random = new Random().nextInt(10);
        return new Tuple2<>(s, s + ":" + random);
    });
    javaPairRDD.foreach((VoidFunction<Tuple2<String, String>>) tuple2 -> {
        attackAccumulator.add(tuple2._2);
    });
    System.out.println(attackAccumulator.value());
}
Example #2
Source File: Intersection.java From SparkDemo with MIT License | 6 votes |
static void intersection(JavaSparkContext sc) {
    List<String> datas1 = Arrays.asList("张三", "李四", "tom");
    List<String> datas2 = Arrays.asList("tom", "gim");

    /**
     * Returns the intersection of the two RDDs.
     */
    JavaRDD<String> intersectionRDD = sc.parallelize(datas1).intersection(sc.parallelize(datas2));

    intersectionRDD.foreach(new VoidFunction<String>() {
        @Override
        public void call(String t) throws Exception {
            System.out.println(t);
        }
    });
}
Example #3
Source File: SampleAndTake.java From SparkDemo with MIT License | 6 votes |
static void sample(JavaSparkContext sc) {
    List<Integer> datas = Arrays.asList(1, 2, 3, 7, 4, 5, 8);

    JavaRDD<Integer> dataRDD = sc.parallelize(datas);

    /**
     * Random sampling. When withReplacement is true, sampled elements are put back and may be
     * drawn multiple times; false means sampling without replacement. fraction is the sampling
     * ratio; seed is the random number seed.
     */
    JavaRDD<Integer> sampleRDD = dataRDD.sample(false, 0.5, System.currentTimeMillis());

    // TODO dataRDD.takeSample(false, 3);
    // TODO dataRDD.take(3)

    sampleRDD.foreach(new VoidFunction<Integer>() {
        @Override
        public void call(Integer t) throws Exception {
            System.out.println(t);
        }
    });

    sc.close();
}
Example #4
Source File: Distinct.java From SparkDemo with MIT License | 6 votes |
private static void distinct(JavaSparkContext sc) {
    List<String> datas = Arrays.asList("张三", "李四", "tom", "张三");

    /**
     * Remove duplicates -- this involves a shuffle operation.
     */
    JavaRDD<String> distinctRDD = sc.parallelize(datas).distinct();

    distinctRDD.foreach(new VoidFunction<String>() {
        @Override
        public void call(String t) throws Exception {
            System.out.println(t);
        }
    });
}
Example #5
Source File: Union.java From SparkDemo with MIT License | 6 votes |
static void union(JavaSparkContext sc) {
    List<String> datas1 = Arrays.asList("张三", "李四");
    List<String> datas2 = Arrays.asList("tom", "gim");

    JavaRDD<String> data1RDD = sc.parallelize(datas1);
    JavaRDD<String> data2RDD = sc.parallelize(datas2);

    /**
     * Merges the two RDDs without removing duplicates; the element types of the two RDDs must match.
     */
    JavaRDD<String> unionRDD = data1RDD.union(data2RDD);

    unionRDD.foreach(new VoidFunction<String>() {
        @Override
        public void call(String t) throws Exception {
            System.out.println(t);
        }
    });

    sc.close();
}
Example #6
Source File: FlatMap.java From SparkDemo with MIT License | 6 votes |
private static void flatMap(JavaSparkContext sc) {
    List<String> data = Arrays.asList("aa,bb,cc", "cxf,spring,struts2", "java,C++,javaScript");
    JavaRDD<String> rddData = sc.parallelize(data);

    FlatMapFunction<String, String> flatMapFunction = new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String s) throws Exception {
            List<String> list = Arrays.asList(s.split(","));
            return list.iterator();
        }
    };
    JavaRDD<String> flatMapData = rddData.flatMap(flatMapFunction);

    flatMapData.foreach(new VoidFunction<String>() {
        @Override
        public void call(String v) throws Exception {
            System.out.println(v);
        }
    });

    sc.close();
}
Example #7
Source File: InputFormatTest.java From HadoopCV with Apache License 2.0 | 6 votes |
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("VideoInput").setMaster("local[2]");
    JavaSparkContext sc = new JavaSparkContext(conf);

    Configuration hc = new org.apache.hadoop.conf.Configuration();
    JavaPairRDD<Text, HBMat> video = sc.newAPIHadoopFile("data/bike.avi",
            VideoInputFormat.class, Text.class, HBMat.class, hc);

    video.foreach(new VoidFunction<Tuple2<Text, HBMat>>() {
        @Override
        public void call(Tuple2<Text, HBMat> tuple) throws Exception {
            HBMat image = (HBMat) tuple._2;
            System.out.print(image.getBmat().dump());
        }
    });

    System.out.print(video.count());
}
Example #8
Source File: Cartesian.java From SparkDemo with MIT License | 6 votes |
private static void cartesian(JavaSparkContext sc) {
    List<String> names = Arrays.asList("张三", "李四", "王五");
    List<Integer> scores = Arrays.asList(60, 70, 80);

    JavaRDD<String> namesRDD = sc.parallelize(names);
    JavaRDD<Integer> scoreRDD = sc.parallelize(scores);

    /**
     * Merges the two RDDs as a Cartesian product.
     */
    JavaPairRDD<String, Integer> cartesianRDD = namesRDD.cartesian(scoreRDD);

    cartesianRDD.foreach(new VoidFunction<Tuple2<String, Integer>>() {
        public void call(Tuple2<String, Integer> t) throws Exception {
            System.out.println(t._1 + "\t" + t._2());
        }
    });
}
Example #9
Source File: Filter.java From SparkDemo with MIT License | 6 votes |
private static void filter(JavaSparkContext sc) {
    List<Integer> datas = Arrays.asList(1, 2, 3, 7, 4, 5, 8);

    JavaRDD<Integer> rddData = sc.parallelize(datas);
    JavaRDD<Integer> filterRDD = rddData.filter(
            // jdk1.8
            // v1 -> v1 >= 3
            new Function<Integer, Boolean>() {
                public Boolean call(Integer v) throws Exception {
                    // Keep only numbers greater than or equal to 4
                    return v >= 4;
                }
            });

    filterRDD.foreach(
            // jdk1.8
            // v -> System.out.println(v)
            new VoidFunction<Integer>() {
                @Override
                public void call(Integer integer) throws Exception {
                    System.out.println(integer);
                }
            });

    sc.close();
}
Example #10
Source File: BroadCastParam.java From sparkResearch with Apache License 2.0 | 6 votes |
/**
 * Broadcast variable test.
 * @param args
 */
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local[4]").appName("AttackFind").getOrCreate();
    // Initialize the JavaSparkContext
    JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(sparkSession.sparkContext());
    // Assume this list is the data to broadcast
    // (as noted earlier, broadcast variables are read-only)
    final List<String> broadcastList = Arrays.asList("190099HJLL", "98392QUEYY", "561788LLKK");
    // Register the broadcast variable and send it to the executors
    final Broadcast<List<String>> broadcast = javaSparkContext.broadcast(broadcastList);
    // Define the data
    JavaPairRDD<String, String> pairRDD = javaSparkContext.parallelizePairs(Arrays.asList(new Tuple2<>("000", "000")));
    JavaPairRDD<String, String> resultPairRDD = pairRDD.filter((Function<Tuple2<String, String>, Boolean>) v1 -> broadcast.value().contains(v1._2));
    resultPairRDD.foreach((VoidFunction<Tuple2<String, String>>) System.out::println);
}
Example #11
Source File: ProcessedOffsetManager.java From kafka-spark-consumer with Apache License 2.0 | 5 votes |
@SuppressWarnings("deprecation")
public static void persists(JavaPairDStream<String, Iterable<Long>> partitonOffset, Properties props) {
    partitonOffset.foreachRDD(new VoidFunction<JavaPairRDD<String, Iterable<Long>>>() {
        @Override
        public void call(JavaPairRDD<String, Iterable<Long>> po) throws Exception {
            List<Tuple2<String, Iterable<Long>>> poList = po.collect();
            doPersists(poList, props);
        }
    });
}
Example #12
Source File: ProcessedOffsetManager.java From kafka-spark-consumer with Apache License 2.0 | 5 votes |
@SuppressWarnings("deprecation")
public static void persists(DStream<Tuple2<String, Iterable<Long>>> partitonOffset, Properties props) {
    ClassTag<Tuple2<String, Iterable<Long>>> tuple2ClassTag =
            ScalaUtil.<String, Iterable<Long>>getTuple2ClassTag();
    JavaDStream<Tuple2<String, Iterable<Long>>> jpartitonOffset =
            new JavaDStream<Tuple2<String, Iterable<Long>>>(partitonOffset, tuple2ClassTag);
    jpartitonOffset.foreachRDD(new VoidFunction<JavaRDD<Tuple2<String, Iterable<Long>>>>() {
        @Override
        public void call(JavaRDD<Tuple2<String, Iterable<Long>>> po) throws Exception {
            List<Tuple2<String, Iterable<Long>>> poList = po.collect();
            doPersists(poList, props);
        }
    });
}
Example #13
Source File: ReaderWriterExample.java From spliceengine with GNU Affero General Public License v3.0 | 5 votes |
public static void main(String[] args) throws Exception {
    final String dbUrl = args[0];
    final String hostname = args[1];
    final String port = args[2];
    final String inTargetSchema = args[3];
    final String inTargetTable = args[4];

    SparkConf conf = new SparkConf();

    JavaStreamingContext ssc = new JavaStreamingContext(conf, new Duration(500));
    JavaReceiverInputDStream<String> stream = ssc.socketTextStream(hostname, Integer.parseInt(port));

    SparkSession spark = SparkSession.builder().getOrCreate();

    // Create a SplicemachineContext based on the provided DB connection
    SplicemachineContext splicemachineContext = new SplicemachineContext(dbUrl);

    // Set target tablename and schemaname
    final String table = inTargetSchema + "." + inTargetTable;

    stream.foreachRDD((VoidFunction<JavaRDD<String>>) rdd -> {
        JavaRDD<Row> rowRDD = rdd.map((Function<String, Row>) s -> RowFactory.create(s));
        Dataset<Row> df = spark.createDataFrame(rowRDD, splicemachineContext.getSchema(table));
        splicemachineContext.insert(df, table);
    });

    ssc.start();
    ssc.awaitTermination();
}
Example #14
Source File: ReaderWriterExample.java From spliceengine with GNU Affero General Public License v3.0 | 5 votes |
public static void main(String[] args) throws Exception {
    final String dbUrl = args[0];
    final String hostname = args[1];
    final String port = args[2];
    final String inTargetSchema = args[3];
    final String inTargetTable = args[4];

    SparkConf conf = new SparkConf();

    JavaStreamingContext ssc = new JavaStreamingContext(conf, new Duration(500));
    SpliceSpark.setContext(ssc.sparkContext());
    SparkSession spark = SpliceSpark.getSessionUnsafe();
    JavaReceiverInputDStream<String> stream = ssc.socketTextStream(hostname, Integer.parseInt(port));

    // Create a SplicemachineContext based on the provided DB connection
    SplicemachineContext splicemachineContext = new SplicemachineContext(dbUrl);

    // Set target tablename and schemaname
    final String table = inTargetSchema + "." + inTargetTable;

    stream.foreachRDD((VoidFunction<JavaRDD<String>>) rdd -> {
        JavaRDD<Row> rowRDD = rdd.map((Function<String, Row>) s -> RowFactory.create(s));
        Dataset<Row> df = spark.createDataFrame(rowRDD, splicemachineContext.getSchema(table));
        splicemachineContext.insert(df, table);
    });

    ssc.start();
    ssc.awaitTermination();
}
Example #15
Source File: ComputeStreamingResponse.java From incubator-retired-pirk with Apache License 2.0 | 5 votes |
private void encryptedColumnCalc(JavaPairDStream<Long, BigInteger> encRowRDD) {
    // Multiply the column values by colNum: emit <colNum, finalColVal>
    JavaPairDStream<Long, BigInteger> encColRDD;
    if (colMultReduceByKey) {
        encColRDD = encRowRDD.reduceByKey(new EncColMultReducer(bVars), numColMultPartitions);
    } else {
        encColRDD = encRowRDD.groupByKey(numColMultPartitions).mapToPair(new EncColMultGroupedMapper(bVars));
    }

    // Update the output name, by batch number
    bVars.setOutput(outputFile + "_" + accum.numBatchesGetValue());

    // Form and write the response object
    encColRDD.repartition(1).foreachRDD((VoidFunction<JavaPairRDD<Long, BigInteger>>) rdd -> {
        rdd.foreachPartition(new FinalResponseFunction(accum, bVars));

        int maxBatchesVar = bVars.getMaxBatches();
        if (maxBatchesVar != -1 && accum.numBatchesGetValue() == maxBatchesVar) {
            logger.info("num batches = maxBatches = " + maxBatchesVar + "; shutting down");
            System.exit(0);
        }
    });
}
Example #16
Source File: Functions.java From spark-streaming-direct-kafka with Apache License 2.0 | 5 votes |
public static <T> VoidFunction<T> noOp() {
    return new VoidFunction<T>() {
        @Override
        public void call(T t) {
            // do nothing
        }
    };
}
Example #17
Source File: AccumulatorValue.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) {
    JavaSparkContext sc = SparkUtils.getLocalSparkContext(AccumulatorValue.class);

    // Create the accumulator
    final Accumulator<Integer> accumulator = sc.accumulator(0, "My Accumulator");

    List<Integer> list = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
    JavaRDD<Integer> listRDD = sc.parallelize(list);

    listRDD.foreach(new VoidFunction<Integer>() {
        @Override
        public void call(Integer n) throws Exception {
            accumulator.add(n);
            // The value cannot be read on an executor; doing so throws an exception
            // System.out.println(accumulator.value());
        }
    });

    // The value can only be read on the driver
    System.out.println(accumulator.value());

    try {
        // Keep the application alive so the web UI stays reachable: http://192.168.68.1:4040
        Thread.sleep(5000L * 5000L * 5000L); // long arithmetic avoids int overflow
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    sc.close();
}
Example #18
Source File: JDBCDataSource.java From SparkDemo with MIT License | 5 votes |
/**
 * @category Save the RDD data into a MySQL database
 * @param sqlContext
 * @param options
 */
private static void saveToMysql(SQLContext sqlContext, Map<String, String> options) {
    options.put("url", "jdbc:mysql://192.168.68.1:3306/tourismdb");
    options.put("dbtable", "t_user");

    Dataset<Row> dataset = sqlContext.read().format("jdbc").options(options).load();

    dataset.javaRDD().foreach(new VoidFunction<Row>() {
        @Override
        public void call(Row row) throws Exception {
            String sql = "insert into t_user(name, password, phone, email, type, status, del) values("
                    + "'" + row.getString(1) + "',"
                    + "'" + row.getString(2) + "',"
                    + "'" + row.getString(3) + "',"
                    + "'" + row.getString(4) + "',"
                    + row.getInt(5) + ","
                    + row.getInt(6) + ","
                    + row.getInt(7) + ")";
            System.out.println(sql);

            Class.forName("com.mysql.jdbc.Driver");
            Connection conn = null;
            Statement statement = null;
            try {
                conn = DriverManager.getConnection("jdbc:mysql://192.168.68.129:3306/sparkdemo", "root", "666666");
                statement = conn.createStatement();
                statement.executeUpdate(sql);
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                if (statement != null) {
                    statement.close();
                }
                if (conn != null) {
                    conn.close();
                }
            }
        }
    });
}
Example #19
Source File: JDBCDataSource.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) {
    // SparkConf conf = new SparkConf().setAppName("JDBCDataSource").setMaster("local");
    JavaSparkContext sc = SparkUtils.getRemoteSparkContext(JDBCDataSource.class);
    SQLContext sqlContext = new SQLContext(sc);

    Map<String, String> options = new HashMap<String, String>();
    options.put("url", "jdbc:mysql://192.168.2.129:3306/hive");
    options.put("dbtable", "t_user");
    options.put("user", "root");
    options.put("password", "666666");

    // Load the JDBC configuration; this does not connect to the database immediately
    Dataset<Row> dataset1 = sqlContext.read().format("jdbc").options(options).load();

    // options.put("dbtable", "tb_item");
    // DataFrame dataFrame2 = sqlContext.read().format("jdbc").options(options).load();

    // Read the JDBC table data
    dataset1.javaRDD().foreach(new VoidFunction<Row>() {
        @Override
        public void call(Row row) throws Exception {
            System.out.println(row);
        }
    });

    // Save the RDD data to MySQL
    saveToMysql(sqlContext, options);

    sc.close();
}
Example #20
Source File: Cogroup.java From SparkDemo with MIT License | 5 votes |
private static void cogroup(JavaSparkContext sc) {
    List<Tuple2<Integer, String>> datas1 = new ArrayList<>();
    datas1.add(new Tuple2<>(1, "苹果"));
    datas1.add(new Tuple2<>(2, "梨"));
    datas1.add(new Tuple2<>(3, "香蕉"));
    datas1.add(new Tuple2<>(4, "石榴"));

    List<Tuple2<Integer, Integer>> datas2 = new ArrayList<>();
    datas2.add(new Tuple2<>(1, 7));
    datas2.add(new Tuple2<>(2, 3));
    datas2.add(new Tuple2<>(3, 8));
    datas2.add(new Tuple2<>(4, 3));

    List<Tuple2<Integer, String>> datas3 = new ArrayList<>();
    datas3.add(new Tuple2<>(1, "7"));
    datas3.add(new Tuple2<>(2, "3"));
    datas3.add(new Tuple2<>(3, "8"));
    datas3.add(new Tuple2<>(4, "3"));
    datas3.add(new Tuple2<>(4, "4"));
    datas3.add(new Tuple2<>(4, "5"));
    datas3.add(new Tuple2<>(4, "6"));

    /**
     * cogroup: for each key, groups the values from each RDD into a separate collection.
     * Unlike reduceByKey, values with the same key are not merged across the RDDs.
     */
    sc.parallelizePairs(datas1).cogroup(sc.parallelizePairs(datas2), sc.parallelizePairs(datas3)).foreach(
            new VoidFunction<Tuple2<Integer, Tuple3<Iterable<String>, Iterable<Integer>, Iterable<String>>>>() {
                @Override
                public void call(Tuple2<Integer, Tuple3<Iterable<String>, Iterable<Integer>, Iterable<String>>> t) throws Exception {
                    System.out.println(t._1 + "==" + t._2);
                }
            });
}
Example #21
Source File: Join.java From SparkDemo with MIT License | 5 votes |
static void join(JavaSparkContext sc) {
    List<Tuple2<Integer, String>> products = new ArrayList<>();
    products.add(new Tuple2<>(1, "苹果"));
    products.add(new Tuple2<>(2, "梨"));
    products.add(new Tuple2<>(3, "香蕉"));
    products.add(new Tuple2<>(4, "石榴"));

    List<Tuple2<Integer, Integer>> counts = new ArrayList<>();
    counts.add(new Tuple2<>(1, 7));
    counts.add(new Tuple2<>(2, 3));
    counts.add(new Tuple2<>(3, 8));
    counts.add(new Tuple2<>(4, 3));
    counts.add(new Tuple2<>(5, 9));

    JavaPairRDD<Integer, String> productsRDD = sc.parallelizePairs(products);
    JavaPairRDD<Integer, Integer> countsRDD = sc.parallelizePairs(counts);

    /**
     * Joins <K, V> with <K, W> and returns (K, (V, W)).
     * The outer-join variants are leftOuterJoin, rightOuterJoin, and fullOuterJoin.
     */
    productsRDD.join(countsRDD)
            .foreach(new VoidFunction<Tuple2<Integer, Tuple2<String, Integer>>>() {
                @Override
                public void call(Tuple2<Integer, Tuple2<String, Integer>> t) throws Exception {
                    System.out.println(t._1 + "\t" + t._2());
                }
            });
}
Example #22
Source File: Runner.java From envelope with Apache License 2.0 | 4 votes |
/**
 * Run the Envelope pipeline as a Spark Streaming job.
 * @param steps The full configuration of the Envelope pipeline
 */
@SuppressWarnings("unchecked")
private void runStreaming(final Set<Step> steps) throws Exception {
    final Set<Step> independentNonStreamingSteps = StepUtils.getIndependentNonStreamingSteps(steps);
    runBatch(independentNonStreamingSteps);

    Set<StreamingStep> streamingSteps = StepUtils.getStreamingSteps(steps);
    for (final StreamingStep streamingStep : streamingSteps) {
        LOG.debug("Setting up streaming step: " + streamingStep.getName());

        JavaDStream stream = streamingStep.getStream();

        stream.foreachRDD(new VoidFunction<JavaRDD<?>>() {
            @Override
            public void call(JavaRDD<?> raw) throws Exception {
                // Some independent steps might be repeating steps that have been flagged for reload
                StepUtils.resetRepeatingSteps(steps);
                // This will run any batch steps (and dependents) that are not submitted
                runBatch(independentNonStreamingSteps);

                streamingStep.setData(streamingStep.translate(raw));
                streamingStep.writeData();
                streamingStep.setState(StepState.FINISHED);

                Set<Step> batchSteps = StepUtils.mergeLoadedSteps(steps, streamingStep, baseConfig);
                Set<Step> dependentSteps = StepUtils.getAllDependentSteps(streamingStep, batchSteps);
                batchSteps.add(streamingStep);
                batchSteps.addAll(streamingStep.loadNewBatchSteps());
                batchSteps.addAll(independentNonStreamingSteps);
                runBatch(batchSteps);

                StepUtils.resetSteps(dependentSteps);

                streamingStep.recordProgress(raw);
            }
        });

        LOG.debug("Finished setting up streaming step: " + streamingStep.getName());
    }

    JavaStreamingContext jsc = Contexts.getJavaStreamingContext();
    jsc.start();
    LOG.debug("Streaming context started");
    jsc.awaitTermination();
    LOG.debug("Streaming context terminated");
}
Example #23
Source File: TranslationUtils.java From beam with Apache License 2.0 | 4 votes |
public static <T> VoidFunction<T> emptyVoidFunction() {
    return t -> {
        // Empty implementation.
    };
}
Example #24
Source File: StreamingIngestionFileSystemTextFileToDataframeApp.java From net.jgp.labs.spark with Apache License 2.0 | 4 votes |
private void start() {
    // Create a local StreamingContext with two working threads and a batch interval of 5 seconds
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName(
            "Streaming Ingestion File System Text File to Dataframe");
    JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));

    JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils.getInputDirectory());
    msgDataStream.print();

    // Create JavaRDD<Row>
    msgDataStream.foreachRDD(new VoidFunction<JavaRDD<String>>() {
        private static final long serialVersionUID = -590010339928376829L;

        @Override
        public void call(JavaRDD<String> rdd) {
            JavaRDD<Row> rowRDD = rdd.map(new Function<String, Row>() {
                private static final long serialVersionUID = 5167089361335095997L;

                @Override
                public Row call(String msg) {
                    Row row = RowFactory.create(msg);
                    return row;
                }
            });
            // Create Schema
            StructType schema = DataTypes.createStructType(
                    new StructField[] { DataTypes.createStructField("Message", DataTypes.StringType, true) });

            // Get Spark 2.0 session
            SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
            Dataset<Row> msgDataFrame = spark.createDataFrame(rowRDD, schema);
            msgDataFrame.show();
        }
    });

    jssc.start();
    try {
        jssc.awaitTermination();
    } catch (InterruptedException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}
Example #25
Source File: ReduceByKey.java From SparkDemo with MIT License | 4 votes |
/**
 * @category Count the words in a text file
 * @param sc
 */
private static void reduceByKey(JavaSparkContext sc) {
    JavaRDD<String> lines = sc.textFile(Constant.LOCAL_FILE_PREX + "README.md");

    /**
     * Split each line on ' ' and flatten the output.
     */
    JavaRDD<String> wordsRDD = lines.flatMap(new FlatMapFunction<String, String>() {
        private static final long serialVersionUID = 1L;

        public Iterator<String> call(String line) throws Exception {
            List<String> words = Arrays.asList(line.split(" "));
            return words.iterator();
        }
    });

    /**
     * Convert each word into a tuple of the form (word, 1).
     */
    JavaPairRDD<String, Integer> wordsCount = wordsRDD.mapToPair(new PairFunction<String, String, Integer>() {
        private static final long serialVersionUID = 1L;

        public Tuple2<String, Integer> call(String word) throws Exception {
            return new Tuple2<String, Integer>(word, 1);
        }
    });

    /**
     * Group by the tuple key (that is, the word) and sum the counts.
     */
    JavaPairRDD<String, Integer> resultRDD = wordsCount.reduceByKey(new Function2<Integer, Integer, Integer>() {
        private static final long serialVersionUID = 1L;

        public Integer call(Integer v1, Integer v2) throws Exception {
            return v1 + v2;
        }
    });

    resultRDD.foreach(new VoidFunction<Tuple2<String, Integer>>() {
        private static final long serialVersionUID = 1L;

        public void call(Tuple2<String, Integer> t) throws Exception {
            System.out.println(t._1 + "\t" + t._2());
        }
    });

    sc.close();
}
Example #26
Source File: JavaHBaseMapGetPutExample.java From learning-hadoop with Apache License 2.0 | 4 votes |
public static void main(String args[]) {
    if (args.length == 0) {
        System.out.println("JavaHBaseBulkGetExample {master} {tableName}");
    }

    String master = args[0];
    String tableName = args[1];

    JavaSparkContext jsc = new JavaSparkContext(master, "JavaHBaseBulkGetExample");
    jsc.addJar("SparkHBase.jar");

    List<byte[]> list = new ArrayList<byte[]>();
    list.add(Bytes.toBytes("1"));
    list.add(Bytes.toBytes("2"));
    list.add(Bytes.toBytes("3"));
    list.add(Bytes.toBytes("4"));
    list.add(Bytes.toBytes("5"));

    // All Spark
    JavaRDD<byte[]> rdd = jsc.parallelize(list);

    // All HBase
    Configuration conf = HBaseConfiguration.create();
    conf.addResource(new Path("/etc/hbase/conf/core-site.xml"));
    conf.addResource(new Path("/etc/hbase/conf/hbase-site.xml"));

    // This is me
    JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf);

    // This is me
    hbaseContext.foreachPartition(rdd, null);
    hbaseContext.foreach(rdd, new VoidFunction<Tuple2<byte[], HConnection>>() {
        public void call(Tuple2<byte[], HConnection> t) throws Exception {
            HTableInterface table1 = t._2.getTable(Bytes.toBytes("Foo"));

            byte[] b = t._1;
            Result r = table1.get(new Get(b));
            if (r.getExists()) {
                table1.put(new Put(b));
            }
        }
    });
}
Example #27
Source File: SparkStreamingFromFlumeToHBaseWindowingExample.java From SparkOnALog with Apache License 2.0 | 4 votes |
public static void main(String[] args) {
    if (args.length == 0) {
        System.err.println("Usage: SparkStreamingFromFlumeToHBaseWindowingExample "
                + "{master} {host} {port} {table} {columnFamily} {windowInSeconds} {slideInSeconds}");
        System.exit(1);
    }

    String master = args[0];
    String host = args[1];
    int port = Integer.parseInt(args[2]);
    String tableName = args[3];
    String columnFamily = args[4];
    int windowInSeconds = Integer.parseInt(args[5]);
    int slideInSeconds = Integer.parseInt(args[6]); // the original read args[5] here, which ignored the slide argument

    Duration batchInterval = new Duration(2000);
    Duration windowInterval = new Duration(windowInSeconds * 1000);
    Duration slideInterval = new Duration(slideInSeconds * 1000);

    JavaStreamingContext sc = new JavaStreamingContext(master, "FlumeEventCount", batchInterval,
            System.getenv("SPARK_HOME"), "/home/cloudera/SparkOnALog.jar");

    final Broadcast<String> broadcastTableName = sc.sparkContext().broadcast(tableName);
    final Broadcast<String> broadcastColumnFamily = sc.sparkContext().broadcast(columnFamily);

    // JavaDStream<SparkFlumeEvent> flumeStream = sc.flumeStream(host, port);
    JavaDStream<SparkFlumeEvent> flumeStream = FlumeUtils.createStream(sc, host, port);

    JavaPairDStream<String, Integer> lastCounts = flumeStream
            .flatMap(new FlatMapFunction<SparkFlumeEvent, String>() {
                @Override
                public Iterable<String> call(SparkFlumeEvent event) throws Exception {
                    String bodyString = new String(event.event().getBody().array(), "UTF-8");
                    return Arrays.asList(bodyString.split(" "));
                }
            }).map(new PairFunction<String, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(String str) throws Exception {
                    return new Tuple2(str, 1);
                }
            }).reduceByKeyAndWindow(new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(Integer x, Integer y) throws Exception {
                    return x.intValue() + y.intValue();
                }
            }, windowInterval, slideInterval);

    lastCounts.foreach(new Function2<JavaPairRDD<String, Integer>, Time, Void>() {
        @Override
        public Void call(JavaPairRDD<String, Integer> values, Time time) throws Exception {
            values.foreach(new VoidFunction<Tuple2<String, Integer>>() {
                @Override
                public void call(Tuple2<String, Integer> tuple) throws Exception {
                    HBaseCounterIncrementor incrementor = HBaseCounterIncrementor
                            .getInstance(broadcastTableName.value(), broadcastColumnFamily.value());
                    incrementor.incerment("Counter", tuple._1(), tuple._2());
                    System.out.println("Counter:" + tuple._1() + "," + tuple._2());
                }
            });
            return null;
        }
    });

    sc.start();
}
Example #28
Source File: SparkStreamingFromFlumeToHBaseExample.java From SparkOnALog with Apache License 2.0 | 4 votes |
public static void main(String[] args) {
    if (args.length == 0) {
        System.err.println("Usage: SparkStreamingFromFlumeToHBaseExample "
                + "{master} {host} {port} {table} {columnFamily}");
        System.exit(1);
    }

    String master = args[0];
    String host = args[1];
    int port = Integer.parseInt(args[2]);
    String tableName = args[3];
    String columnFamily = args[4];

    Duration batchInterval = new Duration(2000);

    JavaStreamingContext sc = new JavaStreamingContext(master, "FlumeEventCount", batchInterval,
            System.getenv("SPARK_HOME"), "/home/cloudera/SparkOnALog.jar");

    final Broadcast<String> broadcastTableName = sc.sparkContext().broadcast(tableName);
    final Broadcast<String> broadcastColumnFamily = sc.sparkContext().broadcast(columnFamily);

    // JavaDStream<SparkFlumeEvent> flumeStream = sc.flumeStream(host, port);
    JavaDStream<SparkFlumeEvent> flumeStream = FlumeUtils.createStream(sc, host, port);

    JavaPairDStream<String, Integer> lastCounts = flumeStream
            .flatMap(new FlatMapFunction<SparkFlumeEvent, String>() {
                @Override
                public Iterable<String> call(SparkFlumeEvent event) throws Exception {
                    String bodyString = new String(event.event().getBody().array(), "UTF-8");
                    return Arrays.asList(bodyString.split(" "));
                }
            }).map(new PairFunction<String, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(String str) throws Exception {
                    return new Tuple2(str, 1);
                }
            }).reduceByKey(new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(Integer x, Integer y) throws Exception {
                    return x.intValue() + y.intValue();
                }
            });

    lastCounts.foreach(new Function2<JavaPairRDD<String, Integer>, Time, Void>() {
        @Override
        public Void call(JavaPairRDD<String, Integer> values, Time time) throws Exception {
            values.foreach(new VoidFunction<Tuple2<String, Integer>>() {
                @Override
                public void call(Tuple2<String, Integer> tuple) throws Exception {
                    HBaseCounterIncrementor incrementor = HBaseCounterIncrementor
                            .getInstance(broadcastTableName.value(), broadcastColumnFamily.value());
                    incrementor.incerment("Counter", tuple._1(), tuple._2());
                    System.out.println("Counter:" + tuple._1() + "," + tuple._2());
                }
            });
            return null;
        }
    });

    sc.start();
}
Example #29
Source File: JavaStreamingTestExample.java From SparkDemo with MIT License | 4 votes |
public static void main(String[] args) throws Exception {
    if (args.length != 3) {
        System.err.println("Usage: JavaStreamingTestExample " +
                "<dataDir> <batchDuration> <numBatchesTimeout>");
        System.exit(1);
    }

    String dataDir = args[0];
    Duration batchDuration = Seconds.apply(Long.parseLong(args[1]));
    int numBatchesTimeout = Integer.parseInt(args[2]);

    SparkConf conf = new SparkConf().setMaster("local").setAppName("StreamingTestExample");
    JavaStreamingContext ssc = new JavaStreamingContext(conf, batchDuration);

    ssc.checkpoint(Utils.createTempDir(System.getProperty("java.io.tmpdir"), "spark").toString());

    // $example on$
    JavaDStream<BinarySample> data = ssc.textFileStream(dataDir).map(
            new Function<String, BinarySample>() {
                @Override
                public BinarySample call(String line) {
                    String[] ts = line.split(",");
                    boolean label = Boolean.parseBoolean(ts[0]);
                    double value = Double.parseDouble(ts[1]);
                    return new BinarySample(label, value);
                }
            });

    StreamingTest streamingTest = new StreamingTest()
            .setPeacePeriod(0)
            .setWindowSize(0)
            .setTestMethod("welch");

    JavaDStream<StreamingTestResult> out = streamingTest.registerStream(data);
    out.print();
    // $example off$

    // Stop processing if test becomes significant or we time out
    timeoutCounter = numBatchesTimeout;

    out.foreachRDD(new VoidFunction<JavaRDD<StreamingTestResult>>() {
        @Override
        public void call(JavaRDD<StreamingTestResult> rdd) {
            timeoutCounter -= 1;

            boolean anySignificant = !rdd.filter(new Function<StreamingTestResult, Boolean>() {
                @Override
                public Boolean call(StreamingTestResult v) {
                    return v.pValue() < 0.05;
                }
            }).isEmpty();

            if (timeoutCounter <= 0 || anySignificant) {
                rdd.context().stop();
            }
        }
    });

    ssc.start();
    ssc.awaitTermination();
}
Example #30
Source File: SampleConsumer.java From kafka-spark-consumer with Apache License 2.0 | 4 votes |
@SuppressWarnings("deprecation")
private void run() {
    Properties props = new Properties();
    props.put("zookeeper.hosts", "zkhost");
    props.put("zookeeper.port", "2181");
    props.put("kafka.topic", "topicA,topicB,topicC");
    props.put("kafka.consumer.id", "kafka-consumer");
    // Optional Properties
    props.put("zookeeper.broker.path", "/brokers");
    props.put("zookeeper.consumer.path", "/consumers");
    props.put("consumer.forcefromstart", "false");
    props.put("max.poll.records", "10");
    props.put("consumer.fillfreqms", "500");
    props.put("consumer.backpressure.enabled", "true");
    // Kafka properties
    props.put("bootstrap.servers", "kafkahost-1:6667,"
            + "kafkahost-2:6667,"
            + "kafkahost-3:6667,"
            + "kafkahost-4:6667");
    props.put("security.protocol", "SSL");
    props.put("ssl.truststore.location", "~/kafka-securitykafka.server.truststore.jks");
    props.put("ssl.truststore.password", "test1234");

    SparkConf _sparkConf = new SparkConf();
    JavaStreamingContext jsc = new JavaStreamingContext(_sparkConf, Durations.seconds(30));
    // Specify number of Receivers you need.
    int numberOfReceivers = 6;

    JavaDStream<MessageAndMetadata<byte[]>> unionStreams = ReceiverLauncher.launch(
            jsc, props, numberOfReceivers, StorageLevel.MEMORY_ONLY());

    unionStreams.foreachRDD(new VoidFunction<JavaRDD<MessageAndMetadata<byte[]>>>() {
        @Override
        public void call(JavaRDD<MessageAndMetadata<byte[]>> rdd) throws Exception {
            // Start Application Logic
            rdd.foreachPartition(new VoidFunction<Iterator<MessageAndMetadata<byte[]>>>() {
                @Override
                public void call(Iterator<MessageAndMetadata<byte[]>> mmItr) throws Exception {
                    int countTopicA = 0;
                    int countTopicB = 0;
                    int countTopicC = 0;
                    while (mmItr.hasNext()) {
                        MessageAndMetadata<byte[]> mm = mmItr.next();
                        if (mm.getTopic().equals("topicA")) {
                            countTopicA++;
                        } else if (mm.getTopic().equals("topicB")) {
                            countTopicB++;
                        } else if (mm.getTopic().equals("topicC")) {
                            countTopicC++;
                        }
                    }
                    System.out.println("topicA count " + countTopicA);
                    System.out.println("topicB count " + countTopicB);
                    System.out.println("topicC count " + countTopicC);
                }
            });
            System.out.println("RDD count " + rdd.count());
            // End Application Logic
            // commit offset
            System.out.println("Commiting Offset");
            ProcessedOffsetManager.persistsPartition(rdd, props);
        }
    });

    try {
        jsc.start();
        jsc.awaitTermination();
    } catch (Exception ex) {
        jsc.ssc().sc().cancelAllJobs();
        jsc.stop(true, false);
        System.exit(-1);
    }
}