org.apache.spark.api.java.function.VoidFunction Java Examples

The following examples show how to use org.apache.spark.api.java.function.VoidFunction. Each example notes the project and source file it was taken from.
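For reference, VoidFunction is the Java API's single-method interface for side-effecting operations: it declares void call(T t) throws Exception and is accepted by actions such as JavaRDD.foreach and JavaDStream.foreachRDD. The minimal sketch below (the class name and sample data are made up for illustration) shows both the anonymous-class and lambda forms that appear throughout the examples on this page:

import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;

public class VoidFunctionSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext("local[2]", "VoidFunctionSketch");
        JavaRDD<String> rdd = sc.parallelize(Arrays.asList("a", "b", "c"));

        // Anonymous-class form (pre-Java 8 style)
        rdd.foreach(new VoidFunction<String>() {
            @Override
            public void call(String s) throws Exception {
                System.out.println(s);
            }
        });

        // Lambda form, written with the explicit cast used in several examples on this page
        rdd.foreach((VoidFunction<String>) s -> System.out.println(s));

        sc.close();
    }
}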
Example #1
Source File: Accumulator.java    From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local[4]").appName("AttackFind").getOrCreate();
    // Initialize the JavaSparkContext
    JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(sparkSession.sparkContext());
    // Set the log output level
    javaSparkContext.setLogLevel("ERROR");
    // Create the RDD
    JavaRDD<String> rdd = javaSparkContext.parallelize(Arrays.asList(JavaBean.origin_id, JavaBean.asset_name)).cache();

    AttackAccumulator attackAccumulator = new AttackAccumulator();
    // Register the accumulator
    javaSparkContext.sc().register(attackAccumulator, "attack_count");
    // Generate a random number as the value
    JavaPairRDD<String, String> javaPairRDD = rdd.mapToPair((PairFunction<String, String, String>) s -> {
        Integer random = new Random().nextInt(10);
        return new Tuple2<>(s, s + ":" + random);
    });

    javaPairRDD.foreach((VoidFunction<Tuple2<String, String>>) tuple2 -> {
        attackAccumulator.add(tuple2._2);
    });
    System.out.println(attackAccumulator.value());
}
 
Example #2
Source File: Intersection.java    From SparkDemo with MIT License
static void intersection(JavaSparkContext sc) {
	List<String> datas1 = Arrays.asList("张三", "李四", "tom");
	List<String> datas2 = Arrays.asList("tom", "gim");

	/**
	 * Returns the intersection of the two RDDs.
	 */
	JavaRDD<String> intersectionRDD = sc.parallelize(datas1).intersection(sc.parallelize(datas2));

	intersectionRDD.foreach(new VoidFunction<String>() {

		@Override
		public void call(String t) throws Exception {
			System.out.println(t);
		}
	});

}
 
Example #3
Source File: SampleAndTake.java    From SparkDemo with MIT License
static void sample(JavaSparkContext sc) {
	List<Integer> datas = Arrays.asList(1, 2, 3, 7, 4, 5, 8);

	JavaRDD<Integer> dataRDD = sc.parallelize(datas);
	
	/**
	 * Random sampling. When withReplacement is true, sampled elements are put back and can be
	 * drawn more than once; false means sampling without replacement. fraction is the sampling
	 * ratio and seed is the random-number seed.
	 */
	JavaRDD<Integer> sampleRDD = dataRDD.sample(false, 0.5, System.currentTimeMillis());
	
	// TODO dataRDD.takeSample(false, 3);
	// TODO dataRDD.take(3)

	sampleRDD.foreach(new VoidFunction<Integer>() {
		@Override
		public void call(Integer t) throws Exception {
			System.out.println(t);
		}
	});

	sc.close();
}
 
Example #4
Source File: Distinct.java    From SparkDemo with MIT License
private static void distinct(JavaSparkContext sc) {
	List<String> datas = Arrays.asList("张三", "李四", "tom", "张三");

	/**
	 * Deduplicate the elements (this involves a shuffle operation).
	 */
	JavaRDD<String> distinctRDD = sc.parallelize(datas).distinct();
	
	distinctRDD.foreach(new VoidFunction<String>() {
		@Override
		public void call(String t) throws Exception {
			System.out.println(t);
		}
	});
}
 
Example #5
Source File: Union.java    From SparkDemo with MIT License
static void union(JavaSparkContext sc ) {
    List<String> datas1 = Arrays.asList("张三", "李四");
    List<String> datas2 = Arrays.asList("tom", "gim");

    JavaRDD<String> data1RDD = sc.parallelize(datas1);
    JavaRDD<String> data2RDD = sc.parallelize(datas2);

    /**
     * Merges the two RDDs without deduplication; the element types of the two RDDs must match.
     */
    JavaRDD<String> unionRDD = data1RDD
            .union(data2RDD);

    unionRDD.foreach(new VoidFunction<String>() {
		@Override
		public void call(String t) throws Exception {
			System.out.println(t);
		}
	});

    sc.close();
}
 
Example #6
Source File: FlatMap.java    From SparkDemo with MIT License
private static void flatMap(JavaSparkContext sc) {
	List<String> data = Arrays.asList("aa,bb,cc", "cxf,spring,struts2", "java,C++,javaScript");
	JavaRDD<String> rddData = sc.parallelize(data);

	FlatMapFunction<String, String> flatMapFunction=new FlatMapFunction<String, String>() {
		@Override
		public Iterator<String> call(String s) throws Exception {
			List<String> list = Arrays.asList(s.split(","));
			return list.iterator();
		}
	};
	JavaRDD<String> flatMapData = rddData.flatMap(flatMapFunction);


	flatMapData.foreach(new VoidFunction<String>() {
		@Override
		public void call(String v) throws Exception {
			System.out.println(v);
		}
	});

	sc.close();
}
 
Example #7
Source File: InputFormatTest.java    From HadoopCV with Apache License 2.0
public static void main(String[] args) {
	SparkConf conf = new SparkConf().setAppName("VideoInput").setMaster("local[2]");
	JavaSparkContext sc = new JavaSparkContext(conf);
	
	Configuration hc = new org.apache.hadoop.conf.Configuration();
	JavaPairRDD<Text, HBMat> video = sc.newAPIHadoopFile("data/bike.avi", VideoInputFormat.class, Text.class, HBMat.class,hc);
	
	video.foreach(new VoidFunction<Tuple2<Text,HBMat>>() {	
		@Override
		public void call(Tuple2<Text, HBMat> tuple) throws Exception {
			HBMat image = (HBMat)tuple._2;
			System.out.print(image.getBmat().dump());
		}
	});
	
	System.out.print(video.count());
}
 
Example #8
Source File: Cartesian.java    From SparkDemo with MIT License
private static void cartesian(JavaSparkContext sc) {
    List<String> names = Arrays.asList("张三", "李四", "王五");
    List<Integer> scores = Arrays.asList(60, 70, 80);

    JavaRDD<String> namesRDD = sc.parallelize(names);
    JavaRDD<Integer> scoreRDD = sc.parallelize(scores);

    /**
     * Computes the Cartesian product of the two RDDs.
     */
    JavaPairRDD<String, Integer> cartesianRDD = namesRDD.cartesian(scoreRDD);
    
    cartesianRDD.foreach(new VoidFunction<Tuple2<String, Integer>>() {
        public void call(Tuple2<String, Integer> t) throws Exception {
            System.out.println(t._1 + "\t" + t._2());
        }
    });
}
 
Example #9
Source File: Filter.java    From SparkDemo with MIT License
private static void filter(JavaSparkContext sc) {
	List<Integer> datas = Arrays.asList(1, 2, 3, 7, 4, 5, 8);

	JavaRDD<Integer> rddData = sc.parallelize(datas);
	JavaRDD<Integer> filterRDD = rddData.filter(
			// jdk1.8
			// v1 -> v1 >= 4
			new Function<Integer, Boolean>() {
				public Boolean call(Integer v) throws Exception {
					// Filter out numbers less than 4
					return v >= 4;
				}
			});

	filterRDD.foreach(
			// jdk1.8
			// v -> System.out.println(v)
			new VoidFunction<Integer>() {
				@Override
				public void call(Integer integer) throws Exception {
					System.out.println(integer);
				}
			});
	sc.close();
}
 
Example #10
Source File: BroadCastParam.java    From sparkResearch with Apache License 2.0
/**
 * Broadcast variable test
 * @param args
 */
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local[4]").appName("AttackFind").getOrCreate();
    // Initialize the JavaSparkContext
    JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(sparkSession.sparkContext());
    // Assume this list is the data to broadcast
    // As mentioned earlier, broadcast variables are read-only
    final List<String> broadcastList = Arrays.asList("190099HJLL","98392QUEYY","561788LLKK");
    // Create the broadcast variable and distribute the list to the executors
    final Broadcast<List<String>> broadcast = javaSparkContext.broadcast(broadcastList);
    // Define the data
    JavaPairRDD<String,String> pairRDD = javaSparkContext.parallelizePairs(Arrays.asList(new Tuple2<>("000", "000")));
    JavaPairRDD<String,String> resultPairRDD = pairRDD.filter((Function<Tuple2<String, String>, Boolean>) v1 -> broadcast.value().contains(v1._2));
    resultPairRDD.foreach((VoidFunction<Tuple2<String, String>>) System.out::println);
}
 
Example #11
Source File: ProcessedOffsetManager.java    From kafka-spark-consumer with Apache License 2.0
@SuppressWarnings("deprecation")
public static void persists(JavaPairDStream<String, Iterable<Long>> partitonOffset, Properties props) {
  partitonOffset.foreachRDD(new VoidFunction<JavaPairRDD<String,Iterable<Long>>>() {
    @Override
    public void call(JavaPairRDD<String, Iterable<Long>> po) throws Exception {
      List<Tuple2<String, Iterable<Long>>> poList = po.collect();
      doPersists(poList, props);
    }
  });
}
 
Example #12
Source File: ProcessedOffsetManager.java    From kafka-spark-consumer with Apache License 2.0
@SuppressWarnings("deprecation")
public static void persists(DStream<Tuple2<String, Iterable<Long>>> partitonOffset, Properties props) {
  ClassTag<Tuple2<String, Iterable<Long>>> tuple2ClassTag =
      ScalaUtil.<String, Iterable<Long>>getTuple2ClassTag();
  JavaDStream<Tuple2<String, Iterable<Long>>> jpartitonOffset =
      new JavaDStream<Tuple2<String, Iterable<Long>>>(partitonOffset, tuple2ClassTag);
  jpartitonOffset.foreachRDD(new VoidFunction<JavaRDD<Tuple2<String, Iterable<Long>>>>() {
    @Override
    public void call(JavaRDD<Tuple2<String, Iterable<Long>>> po) throws Exception {
      List<Tuple2<String, Iterable<Long>>> poList = po.collect();
      doPersists(poList, props);
    }
  });
}
 
Example #13
Source File: ReaderWriterExample.java    From spliceengine with GNU Affero General Public License v3.0
public static void main(String[] args) throws Exception {

        final String dbUrl = args[0];
        final String hostname = args[1];
        final String port = args[2];
        final String inTargetSchema = args[3];
        final String inTargetTable = args[4];

        SparkConf conf = new SparkConf();

        JavaStreamingContext ssc = new JavaStreamingContext(conf, new Duration(500));

        JavaReceiverInputDStream<String> stream = ssc.socketTextStream(hostname, Integer.parseInt(port));

        SparkSession spark = SparkSession.builder().getOrCreate();

        // Create a SplicemachineContext based on the provided DB connection
        SplicemachineContext splicemachineContext = new SplicemachineContext(dbUrl);

        // Set target tablename and schemaname
        final String table = inTargetSchema + "." + inTargetTable;

        stream.foreachRDD((VoidFunction<JavaRDD<String>>) rdd -> {
            JavaRDD<Row> rowRDD = rdd.map((Function<String, Row>) s -> RowFactory.create(s));
            Dataset<Row> df = spark.createDataFrame(rowRDD, splicemachineContext.getSchema(table));

            splicemachineContext.insert(df, table);
        });

        ssc.start();
        ssc.awaitTermination();
    }
 
Example #14
Source File: ReaderWriterExample.java    From spliceengine with GNU Affero General Public License v3.0
public static void main(String[] args) throws Exception {

        final String dbUrl = args[0];
        final String hostname = args[1];
        final String port = args[2];
        final String inTargetSchema = args[3];
        final String inTargetTable = args[4];

        SparkConf conf = new SparkConf();

        JavaStreamingContext ssc = new JavaStreamingContext(conf, new Duration(500));
        SpliceSpark.setContext(ssc.sparkContext());

        SparkSession spark = SpliceSpark.getSessionUnsafe();

        JavaReceiverInputDStream<String> stream = ssc.socketTextStream(hostname, Integer.parseInt(port));

        // Create a SplicemachineContext based on the provided DB connection
        SplicemachineContext splicemachineContext = new SplicemachineContext(dbUrl);

        // Set target tablename and schemaname
        final String table = inTargetSchema + "." + inTargetTable;

        stream.foreachRDD((VoidFunction<JavaRDD<String>>) rdd -> {
            JavaRDD<Row> rowRDD = rdd.map((Function<String, Row>) s -> RowFactory.create(s));
            Dataset<Row> df = spark.createDataFrame(rowRDD, splicemachineContext.getSchema(table));

            splicemachineContext.insert(df, table);
        });

        ssc.start();
        ssc.awaitTermination();
    }
 
Example #15
Source File: ComputeStreamingResponse.java    From incubator-retired-pirk with Apache License 2.0
private void encryptedColumnCalc(JavaPairDStream<Long,BigInteger> encRowRDD)
{
  // Multiply the column values by colNum: emit <colNum, finalColVal>
  JavaPairDStream<Long,BigInteger> encColRDD;
  if (colMultReduceByKey)
  {
    encColRDD = encRowRDD.reduceByKey(new EncColMultReducer(bVars), numColMultPartitions);
  }
  else
  {
    encColRDD = encRowRDD.groupByKey(numColMultPartitions).mapToPair(new EncColMultGroupedMapper(bVars));
  }

  // Update the output name, by batch number
  bVars.setOutput(outputFile + "_" + accum.numBatchesGetValue());

  // Form and write the response object
  encColRDD.repartition(1).foreachRDD((VoidFunction<JavaPairRDD<Long,BigInteger>>) rdd -> {
    rdd.foreachPartition(new FinalResponseFunction(accum, bVars));

    int maxBatchesVar = bVars.getMaxBatches();
    if (maxBatchesVar != -1 && accum.numBatchesGetValue() == maxBatchesVar)
    {
      logger.info("num batches = maxBatches = " + maxBatchesVar + "; shutting down");
      System.exit(0);
    }

  });
}
 
Example #16
Source File: Functions.java    From spark-streaming-direct-kafka with Apache License 2.0
public static <T> VoidFunction<T> noOp() {
    return new VoidFunction<T>() {
        @Override
        public void call(T t) {
            // do nothing
        }
    };
}
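A plausible use for such a no-op function (not shown here, so treat the snippet below as an assumption rather than this project's actual usage) is to force evaluation of an RDD: foreach is an action, so it triggers computation even though the supplied function does nothing.

// Hypothetical usage: force the cached RDD to be materialized without doing any per-element work.
// inputRdd stands for any existing JavaRDD<String>; Functions is assumed to be the class above.
JavaRDD<String> cached = inputRdd.cache();
cached.foreach(Functions.<String>noOp());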
 
Example #17
Source File: AccumulatorValue.java    From SparkDemo with MIT License
public static void main(String[] args) {
	JavaSparkContext sc = SparkUtils.getLocalSparkContext(AccumulatorValue.class);

	// Create the accumulator
	final Accumulator<Integer> accumulator = sc.accumulator(0, "My Accumulator");

	List<Integer> list = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
	JavaRDD<Integer> listRDD = sc.parallelize(list);

	listRDD.foreach(new VoidFunction<Integer>() {
		@Override
		public void call(Integer n) throws Exception {
			accumulator.add(n);
			// Cannot be read here; reading the value on an executor throws an exception
			// System.out.println(accumulator.value());
		}
	});

	// The value can only be read on the driver
	System.out.println(accumulator.value());
	
	try {
		// Keep the application alive so the Spark UI (http://192.168.68.1:4040) can be inspected;
		// the long literal avoids int overflow in the sleep duration
		Thread.sleep(5000L * 5000 * 5000);
	} catch (Exception e) {
		// TODO Auto-generated catch block
		e.printStackTrace();
	}

	sc.close();
}
 
Example #18
Source File: JDBCDataSource.java    From SparkDemo with MIT License
/**
 * @category Save the RDD data into a MySQL database
 * @param sqlContext
 * @param options
 */
private static void saveToMysql( SQLContext sqlContext, Map<String, String> options) {
	options.put("url", "jdbc:mysql://192.168.68.1:3306/tourismdb");
	options.put("dbtable", "t_user");
	Dataset<Row> dataset = sqlContext.read().format("jdbc").options(options).load();

	dataset.javaRDD().foreach(new VoidFunction<Row>() {
		@Override
		public void call(Row row) throws Exception {
			String sql = "insert into t_user( name, password, phone, email,type,status,del) values("
					+ "'"+ row.getString(1) + "'," 
					+ "'"+ row.getString(2) + "'," 
					+ "'"+ row.getString(3) + "'," 
					+ "'"+ row.getString(4) + "'," 
					+ row.getInt(5)+ ","
					+ row.getInt(6)+ ","
					+ row.getInt(7)+ ")";
			System.out.println(sql);
			Class.forName("com.mysql.jdbc.Driver");
			Connection conn = null;
			Statement statement = null;
			try {
				conn = DriverManager.getConnection("jdbc:mysql://192.168.68.129:3306/sparkdemo","root","666666");
				statement = conn.createStatement();
				statement.executeUpdate(sql);
			} catch (Exception e) {
				e.printStackTrace();
			} finally {
				if(statement!=null){
					statement.close();
				}
				if (conn!=null) {
					conn.close();
				}
			}
		}
	});
}
 
Example #19
Source File: JDBCDataSource.java    From SparkDemo with MIT License
public static void main(String[] args) {
//		SparkConf conf = new SparkConf().setAppName("JDBCDataSource").setMaster("local");
		JavaSparkContext sc = SparkUtils.getRemoteSparkContext(JDBCDataSource.class);
		SQLContext sqlContext = new SQLContext(sc);

		Map<String, String> options = new HashMap<String, String>();
		options.put("url", "jdbc:mysql://192.168.2.129:3306/hive");
		options.put("dbtable", "t_user");
		options.put("user", "root");
		options.put("password", "666666");

		// Load the JDBC options; this does not connect to the database immediately
		Dataset<Row> dataset1 = sqlContext.read().format("jdbc").options(options).load();

		//		options.put("dbtable", "tb_item");
		//		DataFrame dataFrame2 = sqlContext.read().format("jdbc").options(options).load();

		// Read the JDBC table data
		dataset1.javaRDD().foreach(new VoidFunction<Row>() {
			@Override
			public void call(Row row) throws Exception {
				System.out.println(row);
			}
		});


		// Save the RDD data into MySQL
		saveToMysql( sqlContext, options);

		sc.close();
	}
 
Example #20
Source File: Cogroup.java    From SparkDemo with MIT License
private static void cogroup(JavaSparkContext sc) {
	List<Tuple2<Integer, String>> datas1 = new ArrayList<>();
	datas1.add(new Tuple2<>(1, "苹果"));
	datas1.add(new Tuple2<>(2, "梨"));
	datas1.add(new Tuple2<>(3, "香蕉"));
	datas1.add(new Tuple2<>(4, "石榴"));

	List<Tuple2<Integer, Integer>> datas2 = new ArrayList<>();
	datas2.add(new Tuple2<>(1, 7));
	datas2.add(new Tuple2<>(2, 3));
	datas2.add(new Tuple2<>(3, 8));
	datas2.add(new Tuple2<>(4, 3));

	List<Tuple2<Integer, String>> datas3 = new ArrayList<>();
	datas3.add(new Tuple2<>(1, "7"));
	datas3.add(new Tuple2<>(2, "3"));
	datas3.add(new Tuple2<>(3, "8"));
	datas3.add(new Tuple2<>(4, "3"));
	datas3.add(new Tuple2<>(4, "4"));
	datas3.add(new Tuple2<>(4, "5"));
	datas3.add(new Tuple2<>(4, "6"));

	/**
	 * cogroup: for each key, collects the values from every RDD into a tuple of iterables
	 * (one iterable per RDD). Unlike reduceByKey, values sharing a key are grouped per RDD
	 * rather than reduced to a single value.
	 */
	sc.parallelizePairs(datas1).cogroup(sc.parallelizePairs(datas2), sc.parallelizePairs(datas3)).foreach(
			new VoidFunction<Tuple2<Integer, Tuple3<Iterable<String>, Iterable<Integer>, Iterable<String>>>>() {
				@Override
				public void call(Tuple2<Integer, Tuple3<Iterable<String>, Iterable<Integer>, Iterable<String>>> t)
						throws Exception {
					System.out.println(t._1 + "==" + t._2);
				}
			});
}
 
Example #21
Source File: Join.java    From SparkDemo with MIT License
static void join(JavaSparkContext sc) {
    List<Tuple2<Integer, String>> products = new ArrayList<>();
    products.add(new Tuple2<>(1, "苹果"));
    products.add(new Tuple2<>(2, "梨"));
    products.add(new Tuple2<>(3, "香蕉"));
    products.add(new Tuple2<>(4, "石榴"));

    List<Tuple2<Integer, Integer>> counts = new ArrayList<>();
    counts.add(new Tuple2<>(1, 7));
    counts.add(new Tuple2<>(2, 3));
    counts.add(new Tuple2<>(3, 8));
    counts.add(new Tuple2<>(4, 3));
    counts.add(new Tuple2<>(5, 9));

    JavaPairRDD<Integer, String> productsRDD = sc.parallelizePairs(products);
    JavaPairRDD<Integer, Integer> countsRDD = sc.parallelizePairs(counts);

    /**
     * Joins <K, V> with <K, W> and returns (K, (V, W)). The outer-join variants are
     * leftOuterJoin, rightOuterJoin and fullOuterJoin.
     */
    productsRDD.join(countsRDD)
            .foreach(new VoidFunction<Tuple2<Integer,Tuple2<String,Integer>>>() {
				@Override
				public void call(Tuple2<Integer, Tuple2<String, Integer>> t) throws Exception {
					   System.out.println(t._1 + "\t" + t._2());
				}
			});
}
 
Example #22
Source File: Runner.java    From envelope with Apache License 2.0
/**
 * Run the Envelope pipeline as a Spark Streaming job.
 * @param steps The full configuration of the Envelope pipeline
 */
@SuppressWarnings("unchecked")
private void runStreaming(final Set<Step> steps) throws Exception {
  final Set<Step> independentNonStreamingSteps = StepUtils.getIndependentNonStreamingSteps(steps);
  runBatch(independentNonStreamingSteps);

  Set<StreamingStep> streamingSteps = StepUtils.getStreamingSteps(steps);
  for (final StreamingStep streamingStep : streamingSteps) {
    LOG.debug("Setting up streaming step: " + streamingStep.getName());

    JavaDStream stream = streamingStep.getStream();

    stream.foreachRDD(new VoidFunction<JavaRDD<?>>() {
      @Override
      public void call(JavaRDD<?> raw) throws Exception {
        // Some independent steps might be repeating steps that have been flagged for reload
        StepUtils.resetRepeatingSteps(steps);
        // This will run any batch steps (and dependents) that are not submitted
        runBatch(independentNonStreamingSteps);

        streamingStep.setData(streamingStep.translate(raw));
        streamingStep.writeData();
        streamingStep.setState(StepState.FINISHED);

        Set<Step> batchSteps = StepUtils.mergeLoadedSteps(steps, streamingStep, baseConfig);
        Set<Step> dependentSteps = StepUtils.getAllDependentSteps(streamingStep, batchSteps);
        batchSteps.add(streamingStep);
        batchSteps.addAll(streamingStep.loadNewBatchSteps());
        batchSteps.addAll(independentNonStreamingSteps);
        runBatch(batchSteps);

        StepUtils.resetSteps(dependentSteps);

        streamingStep.recordProgress(raw);
      }
    });

    LOG.debug("Finished setting up streaming step: " + streamingStep.getName());
  }

  JavaStreamingContext jsc = Contexts.getJavaStreamingContext();
  jsc.start();
  LOG.debug("Streaming context started");
  jsc.awaitTermination();
  LOG.debug("Streaming context terminated");
}
 
Example #23
Source File: TranslationUtils.java    From beam with Apache License 2.0
public static <T> VoidFunction<T> emptyVoidFunction() {
  return t -> {
    // Empty implementation.
  };
}
 
Example #24
Source File: StreamingIngestionFileSystemTextFileToDataframeApp.java    From net.jgp.labs.spark with Apache License 2.0
private void start() {
  // Create a local StreamingContext with two working threads and a
  // batch interval of 5 seconds
  SparkConf conf = new SparkConf().setMaster("local[2]").setAppName(
      "Streaming Ingestion File System Text File to Dataframe");
  JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations
      .seconds(5));

  JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils
      .getInputDirectory());

  msgDataStream.print();
  // Create JavaRDD<Row>
  msgDataStream.foreachRDD(new VoidFunction<JavaRDD<String>>() {
    private static final long serialVersionUID = -590010339928376829L;

    @Override
    public void call(JavaRDD<String> rdd) {
      JavaRDD<Row> rowRDD = rdd.map(new Function<String, Row>() {
        private static final long serialVersionUID = 5167089361335095997L;

        @Override
        public Row call(String msg) {
          Row row = RowFactory.create(msg);
          return row;
        }
      });
      // Create Schema
      StructType schema = DataTypes.createStructType(
          new StructField[] { DataTypes.createStructField("Message",
              DataTypes.StringType, true) });

      // Get Spark 2.0 session
      SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context()
          .getConf());
      Dataset<Row> msgDataFrame = spark.createDataFrame(rowRDD, schema);
      msgDataFrame.show();
    }
  });

  jssc.start();
  try {
    jssc.awaitTermination();
  } catch (InterruptedException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
  }
}
 
Example #25
Source File: ReduceByKey.java    From SparkDemo with MIT License
/**
 * @category Count the number of words in a text file
 * @param sc
 */
private static void reduceByKey(JavaSparkContext sc) {
	JavaRDD<String> lines = sc.textFile(Constant.LOCAL_FILE_PREX +"README.md");

	/**
	 * Split each line on ' ' and flatten the words into a single RDD.
	 */
	JavaRDD<String> wordsRDD = lines.flatMap(new FlatMapFunction<String, String>() {

		private static final long serialVersionUID = 1L;

		public Iterator<String> call(String line) throws Exception {
			List<String> words = Arrays.asList(line.split(" "));
			return words.iterator();
		}
	});

	/**
	 * Map each word to a (word, 1) tuple.
	 */
	JavaPairRDD<String, Integer> wordsCount = wordsRDD.mapToPair(new PairFunction<String, String, Integer>() {

		private static final long serialVersionUID = 1L;

		public Tuple2<String, Integer> call(String word) throws Exception {
			return new Tuple2<String, Integer>(word, 1);
		}
	});

	/**
	 * Reduce by the tuple key (i.e. the word) to sum the counts.
	 */
	JavaPairRDD<String, Integer> resultRDD = wordsCount.reduceByKey(new Function2<Integer, Integer, Integer>() {

		private static final long serialVersionUID = 1L;

		public Integer call(Integer v1, Integer v2) throws Exception {
			return v1 + v2;
		}
	});

	resultRDD.foreach(new VoidFunction<Tuple2<String, Integer>>() {

		private static final long serialVersionUID = 1L;

		public void call(Tuple2<String, Integer> t) throws Exception {
			System.out.println(t._1 + "\t" + t._2());
		}
	});

	sc.close();
}
 
Example #26
Source File: JavaHBaseMapGetPutExample.java    From learning-hadoop with Apache License 2.0
public static void main(String args[]) {
  if (args.length == 0) {
    System.out
        .println("JavaHBaseBulkGetExample  {master} {tableName}");
  }

  String master = args[0];
  String tableName = args[1];

  JavaSparkContext jsc = new JavaSparkContext(master,
      "JavaHBaseBulkGetExample");
  jsc.addJar("SparkHBase.jar");

  List<byte[]> list = new ArrayList<byte[]>();
  list.add(Bytes.toBytes("1"));
  list.add(Bytes.toBytes("2"));
  list.add(Bytes.toBytes("3"));
  list.add(Bytes.toBytes("4"));
  list.add(Bytes.toBytes("5"));

  //All Spark
  JavaRDD<byte[]> rdd = jsc.parallelize(list);

  //All HBase
  Configuration conf = HBaseConfiguration.create();
  conf.addResource(new Path("/etc/hbase/conf/core-site.xml"));
  conf.addResource(new Path("/etc/hbase/conf/hbase-site.xml"));

  //This is me
  JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf);

  //This is me
  hbaseContext.foreachPartition(rdd, null);
  
  hbaseContext.foreach(rdd, new VoidFunction<Tuple2<byte[], HConnection>>() {


    public void call(Tuple2<byte[], HConnection> t)
        throws Exception {
      HTableInterface table1 = t._2.getTable(Bytes.toBytes("Foo"));
      
      byte[] b = t._1;
      Result r = table1.get(new Get(b));
      if (r.getExists()) {
        table1.put(new Put(b));
      }
      
    }
  });
  
}
 
Example #27
Source File: SparkStreamingFromFlumeToHBaseWindowingExample.java    From SparkOnALog with Apache License 2.0
public static void main(String[] args) {
	if (args.length == 0) {
		System.err
				.println("Usage: SparkStreamingFromFlumeToHBaseWindowingExample {master} {host} {port} {table} {columnFamily} {windowInSeconds} {slideInSeconds");
		System.exit(1);
	}

	String master = args[0];
	String host = args[1];
	int port = Integer.parseInt(args[2]);
	String tableName = args[3];
	String columnFamily = args[4];
	int windowInSeconds = Integer.parseInt(args[5]);
	int slideInSeconds = Integer.parseInt(args[6]);
	
	Duration batchInterval = new Duration(2000);
	Duration windowInterval = new Duration(windowInSeconds * 1000);
	Duration slideInterval = new Duration(slideInSeconds * 1000);

	JavaStreamingContext sc = new JavaStreamingContext(master,
			"FlumeEventCount", batchInterval,
			System.getenv("SPARK_HOME"), "/home/cloudera/SparkOnALog.jar");
	
	final Broadcast<String> broadcastTableName = sc.sparkContext().broadcast(tableName);
	final Broadcast<String> broadcastColumnFamily = sc.sparkContext().broadcast(columnFamily);
	
	//JavaDStream<SparkFlumeEvent> flumeStream = sc.flumeStream(host, port);
	
	JavaDStream<SparkFlumeEvent> flumeStream = FlumeUtils.createStream(sc, host, port);
	
	
	JavaPairDStream<String, Integer> lastCounts = flumeStream
			.flatMap(new FlatMapFunction<SparkFlumeEvent, String>() {

				@Override
				public Iterable<String> call(SparkFlumeEvent event)
						throws Exception {
					String bodyString = new String(event.event().getBody()
							.array(), "UTF-8");
					return Arrays.asList(bodyString.split(" "));
				}
			}).map(new PairFunction<String, String, Integer>() {
				@Override
				public Tuple2<String, Integer> call(String str)
						throws Exception {
					return new Tuple2(str, 1);
				}
			}).reduceByKeyAndWindow(new Function2<Integer, Integer, Integer>() {

				@Override
				public Integer call(Integer x, Integer y) throws Exception {
					// TODO Auto-generated method stub
					return x.intValue() + y.intValue();
				}
			}, windowInterval, slideInterval);
			
			
			lastCounts.foreach(new Function2<JavaPairRDD<String,Integer>, Time, Void>() {

				@Override
				public Void call(JavaPairRDD<String, Integer> values,
						Time time) throws Exception {
					
					values.foreach(new VoidFunction<Tuple2<String, Integer>> () {

						@Override
						public void call(Tuple2<String, Integer> tuple)
								throws Exception {
							HBaseCounterIncrementor incrementor = 
									HBaseCounterIncrementor.getInstance(broadcastTableName.value(), broadcastColumnFamily.value());
							incrementor.incerment("Counter", tuple._1(), tuple._2());
							System.out.println("Counter:" + tuple._1() + "," + tuple._2());
							
						}} );
					
					return null;
				}});
	
	

	sc.start();

}
 
Example #28
Source File: SparkStreamingFromFlumeToHBaseExample.java    From SparkOnALog with Apache License 2.0
public static void main(String[] args) {
	if (args.length == 0) {
		System.err
				.println("Usage: SparkStreamingFromFlumeToHBaseExample {master} {host} {port} {table} {columnFamily}");
		System.exit(1);
	}

	String master = args[0];
	String host = args[1];
	int port = Integer.parseInt(args[2]);
	String tableName = args[3];
	String columnFamily = args[4];
	
	Duration batchInterval = new Duration(2000);

	JavaStreamingContext sc = new JavaStreamingContext(master,
			"FlumeEventCount", batchInterval,
			System.getenv("SPARK_HOME"), "/home/cloudera/SparkOnALog.jar");
	
	final Broadcast<String> broadcastTableName = sc.sparkContext().broadcast(tableName);
	final Broadcast<String> broadcastColumnFamily = sc.sparkContext().broadcast(columnFamily);
	
	//JavaDStream<SparkFlumeEvent> flumeStream = sc.flumeStream(host, port);
	
	JavaDStream<SparkFlumeEvent> flumeStream = FlumeUtils.createStream(sc, host, port);
	
	JavaPairDStream<String, Integer> lastCounts = flumeStream
			.flatMap(new FlatMapFunction<SparkFlumeEvent, String>() {

				@Override
				public Iterable<String> call(SparkFlumeEvent event)
						throws Exception {
					String bodyString = new String(event.event().getBody()
							.array(), "UTF-8");
					return Arrays.asList(bodyString.split(" "));
				}
			}).map(new PairFunction<String, String, Integer>() {
				@Override
				public Tuple2<String, Integer> call(String str)
						throws Exception {
					return new Tuple2(str, 1);
				}
			}).reduceByKey(new Function2<Integer, Integer, Integer>() {

				@Override
				public Integer call(Integer x, Integer y) throws Exception {
					// TODO Auto-generated method stub
					return x.intValue() + y.intValue();
				}
			});
			
			
			lastCounts.foreach(new Function2<JavaPairRDD<String,Integer>, Time, Void>() {

				@Override
				public Void call(JavaPairRDD<String, Integer> values,
						Time time) throws Exception {
					
					values.foreach(new VoidFunction<Tuple2<String, Integer>> () {

						@Override
						public void call(Tuple2<String, Integer> tuple)
								throws Exception {
							HBaseCounterIncrementor incrementor = 
									HBaseCounterIncrementor.getInstance(broadcastTableName.value(), broadcastColumnFamily.value());
							incrementor.incerment("Counter", tuple._1(), tuple._2());
							System.out.println("Counter:" + tuple._1() + "," + tuple._2());
							
						}} );
					
					return null;
				}});
	
	

	sc.start();

}
 
Example #29
Source File: JavaStreamingTestExample.java    From SparkDemo with MIT License
public static void main(String[] args) throws Exception {
  if (args.length != 3) {
    System.err.println("Usage: JavaStreamingTestExample " +
      "<dataDir> <batchDuration> <numBatchesTimeout>");
      System.exit(1);
  }

  String dataDir = args[0];
  Duration batchDuration = Seconds.apply(Long.parseLong(args[1]));
  int numBatchesTimeout = Integer.parseInt(args[2]);

  SparkConf conf = new SparkConf().setMaster("local").setAppName("StreamingTestExample");
  JavaStreamingContext ssc = new JavaStreamingContext(conf, batchDuration);

  ssc.checkpoint(Utils.createTempDir(System.getProperty("java.io.tmpdir"), "spark").toString());

  // $example on$
  JavaDStream<BinarySample> data = ssc.textFileStream(dataDir).map(
    new Function<String, BinarySample>() {
      @Override
      public BinarySample call(String line) {
        String[] ts = line.split(",");
        boolean label = Boolean.parseBoolean(ts[0]);
        double value = Double.parseDouble(ts[1]);
        return new BinarySample(label, value);
      }
    });

  StreamingTest streamingTest = new StreamingTest()
    .setPeacePeriod(0)
    .setWindowSize(0)
    .setTestMethod("welch");

  JavaDStream<StreamingTestResult> out = streamingTest.registerStream(data);
  out.print();
  // $example off$

  // Stop processing if test becomes significant or we time out
  timeoutCounter = numBatchesTimeout;

  out.foreachRDD(new VoidFunction<JavaRDD<StreamingTestResult>>() {
    @Override
    public void call(JavaRDD<StreamingTestResult> rdd) {
      timeoutCounter -= 1;

      boolean anySignificant = !rdd.filter(new Function<StreamingTestResult, Boolean>() {
        @Override
        public Boolean call(StreamingTestResult v) {
          return v.pValue() < 0.05;
        }
      }).isEmpty();

      if (timeoutCounter <= 0 || anySignificant) {
        rdd.context().stop();
      }
    }
  });

  ssc.start();
  ssc.awaitTermination();
}
 
Example #30
Source File: SampleConsumer.java    From kafka-spark-consumer with Apache License 2.0
@SuppressWarnings("deprecation")
private void run() {

  Properties props = new Properties();
  props.put("zookeeper.hosts", "zkhost");
  props.put("zookeeper.port", "2181");
  props.put("kafka.topic", "topicA,topicB,topicC");
  props.put("kafka.consumer.id", "kafka-consumer");
  // Optional Properties
  props.put("zookeeper.broker.path", "/brokers");
  props.put("zookeeper.consumer.path", "/consumers");
  props.put("consumer.forcefromstart", "false");
  props.put("max.poll.records", "10");
  props.put("consumer.fillfreqms", "500");
  props.put("consumer.backpressure.enabled", "true");
  //Kafka properties
  props.put("bootstrap.servers", "kafkahost-1:6667,"
          + "kafkahost-2:6667,"
          + "kafkahost-3:6667,"
          + "kafkahost-4:6667");
  props.put("security.protocol", "SSL");
  props.put("ssl.truststore.location","~/kafka-securitykafka.server.truststore.jks");
  props.put("ssl.truststore.password", "test1234");

  SparkConf _sparkConf = new SparkConf();
  JavaStreamingContext jsc = new JavaStreamingContext(_sparkConf, Durations.seconds(30));
  // Specify number of Receivers you need.
  int numberOfReceivers = 6;

  JavaDStream<MessageAndMetadata<byte[]>> unionStreams = ReceiverLauncher.launch(
      jsc, props, numberOfReceivers, StorageLevel.MEMORY_ONLY());

  unionStreams.foreachRDD(new VoidFunction<JavaRDD<MessageAndMetadata<byte[]>>>() {
    @Override
    public void call(JavaRDD<MessageAndMetadata<byte[]>> rdd) throws Exception {
      //Start Application Logic
      rdd.foreachPartition(new VoidFunction<Iterator<MessageAndMetadata<byte[]>>>() {
          @Override
          public void call(Iterator<MessageAndMetadata<byte[]>> mmItr) throws Exception {
              int countTopicA = 0;
              int countTopicB = 0;
              int countTopicC = 0;
              while(mmItr.hasNext()) {
                  MessageAndMetadata<byte[]> mm = mmItr.next();
                  if(mm.getTopic().equals("topicA")) {
                      countTopicA++;
                  }
                  else if (mm.getTopic().equals("topicB")) {
                      countTopicB++;
                  }
                  else if (mm.getTopic().equals("topicC")) {
                      countTopicC++;
                  }
              }
              System.out.println("topicA count " + countTopicA);
              System.out.println("topicB count " + countTopicB);
              System.out.println("topicC count " + countTopicC);
          }
      });
      System.out.println("RDD count " + rdd.count());
      //End Application Logic
      //commit offset
      System.out.println("Commiting Offset");
      ProcessedOffsetManager.persistsPartition(rdd, props);
    }
  });

  try {
    jsc.start();
    jsc.awaitTermination();
  }catch (Exception ex ) {
    jsc.ssc().sc().cancelAllJobs();
    jsc.stop(true, false);
    System.exit(-1);
  }
}