Java Code Examples for org.apache.spark.api.java.function.VoidFunction

The following examples show how to use org.apache.spark.api.java.function.VoidFunction. These examples are extracted from open source projects; the source project, source file, and license are noted above each example where available.
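VoidFunction<T> is the Java-friendly functional interface Spark uses for side-effecting actions such as foreach and foreachRDD: it extends Serializable and declares a single method, void call(T t) throws Exception, so it can be implemented with an anonymous class or, on Java 8+, with a lambda or method reference. As a minimal, self-contained sketch (the class name and sample data below are illustrative and not taken from any of the projects):

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;

public class VoidFunctionSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("VoidFunctionSketch");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> rdd = sc.parallelize(Arrays.asList("a", "b", "c"));

        // Anonymous-class form: implement the single call(T) method.
        rdd.foreach(new VoidFunction<String>() {
            @Override
            public void call(String s) throws Exception {
                System.out.println(s);
            }
        });

        // Java 8 form: the same action as a method reference cast to VoidFunction.
        rdd.foreach((VoidFunction<String>) System.out::println);

        sc.close();
    }
}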
Example 1
Source Project: sparkResearch   Source File: BroadCastParam.java    License: Apache License 2.0
/**
 * Broadcast variable test.
 * @param args
 */
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local[4]").appName("AttackFind").getOrCreate();
    // Initialize the JavaSparkContext
    JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(sparkSession.sparkContext());
    // Set up the data for a broadcast variable
    // As noted earlier, broadcast variables are read-only
    final List<String> broadcastList = Arrays.asList("190099HJLL","98392QUEYY","561788LLKK");
    // Create the broadcast variable and send it out to the executors
    final Broadcast<List<String>> broadcast = javaSparkContext.broadcast(broadcastList);
    // Define the data
    JavaPairRDD<String,String> pairRDD = javaSparkContext.parallelizePairs(Arrays.asList(new Tuple2<>("000", "000")));
    JavaPairRDD<String,String> resultPairRDD = pairRDD.filter((Function<Tuple2<String, String>, Boolean>) v1 -> broadcast.value().contains(v1._2));
    resultPairRDD.foreach((VoidFunction<Tuple2<String, String>>) System.out::println);
}
 
Example 2
Source Project: sparkResearch   Source File: Accumulator.java    License: Apache License 2.0
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local[4]").appName("AttackFind").getOrCreate();
    // Initialize the JavaSparkContext
    JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(sparkSession.sparkContext());
    // Set the log output level
    javaSparkContext.setLogLevel("ERROR");
    // Create the RDD
    JavaRDD<String> rdd = javaSparkContext.parallelize(Arrays.asList(JavaBean.origin_id, JavaBean.asset_name)).cache();

    AttackAccumulator attackAccumulator = new AttackAccumulator();
    // Register the accumulator
    javaSparkContext.sc().register(attackAccumulator, "attack_count");
    // Generate a random number as the value
    JavaPairRDD<String, String> javaPairRDD = rdd.mapToPair((PairFunction<String, String, String>) s -> {
        Integer random = new Random().nextInt(10);
        return new Tuple2<>(s, s + ":" + random);
    });

    javaPairRDD.foreach((VoidFunction<Tuple2<String, String>>) tuple2 -> {
        attackAccumulator.add(tuple2._2);
    });
    System.out.println(attackAccumulator.value());
}
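AttackAccumulator above is a custom accumulator defined elsewhere in the sparkResearch project and not shown here. For orientation, a custom accumulator in Spark 2.x extends org.apache.spark.util.AccumulatorV2 and implements isZero, copy, reset, add, merge, and value. The sketch below is a hypothetical, simplified stand-in (a record counter), not the project's actual AttackAccumulator:

import org.apache.spark.util.AccumulatorV2;

// Hypothetical sketch only; the real AttackAccumulator may accumulate something else entirely.
public class AttackAccumulator extends AccumulatorV2<String, Integer> {

    private int count = 0;

    @Override
    public boolean isZero() {
        return count == 0;
    }

    @Override
    public AccumulatorV2<String, Integer> copy() {
        AttackAccumulator copied = new AttackAccumulator();
        copied.count = this.count;
        return copied;
    }

    @Override
    public void reset() {
        count = 0;
    }

    @Override
    public void add(String v) {
        // Assumed semantics: count every record passed in from the executors.
        count++;
    }

    @Override
    public void merge(AccumulatorV2<String, Integer> other) {
        count += other.value();
    }

    @Override
    public Integer value() {
        return count;
    }
}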
 
Example 3
Source Project: SparkDemo   Source File: Filter.java    License: MIT License
private static void filter(JavaSparkContext sc) {
	List<Integer> datas = Arrays.asList(1, 2, 3, 7, 4, 5, 8);

	JavaRDD<Integer> rddData = sc.parallelize(datas);
	JavaRDD<Integer> filterRDD = rddData.filter(
			// jdk1.8
			// v1 -> v1 >= 4
			new Function<Integer, Boolean>() {
				public Boolean call(Integer v) throws Exception {
					// Filter out numbers less than 4
					return v >= 4;
				}
			});

	filterRDD.foreach(
			// jdk1.8
			// v -> System.out.println(v)
			new VoidFunction<Integer>() {
				@Override
				public void call(Integer integer) throws Exception {
					System.out.println(integer);
				}
			});
	sc.close();
}
 
Example 4
Source Project: SparkDemo   Source File: Cartesian.java    License: MIT License
private static void cartesian(JavaSparkContext sc) {
    List<String> names = Arrays.asList("张三", "李四", "王五");
    List<Integer> scores = Arrays.asList(60, 70, 80);

    JavaRDD<String> namesRDD = sc.parallelize(names);
    JavaRDD<Integer> scoreRDD = sc.parallelize(scores);

    /**
     * Compute the Cartesian product of the two RDDs.
     */
    JavaPairRDD<String, Integer> cartesianRDD = namesRDD.cartesian(scoreRDD);
    
    cartesianRDD.foreach(new VoidFunction<Tuple2<String, Integer>>() {
        public void call(Tuple2<String, Integer> t) throws Exception {
            System.out.println(t._1 + "\t" + t._2());
        }
    });
}
 
Example 5
Source Project: SparkDemo   Source File: FlatMap.java    License: MIT License
private static void flatMap(JavaSparkContext sc) {
	List<String> data = Arrays.asList("aa,bb,cc", "cxf,spring,struts2", "java,C++,javaScript");
	JavaRDD<String> rddData = sc.parallelize(data);

	FlatMapFunction<String, String> flatMapFunction=new FlatMapFunction<String, String>() {
		@Override
		public Iterator<String> call(String s) throws Exception {
			List<String> list = Arrays.asList(s.split(","));
			return list.iterator();
		}
	};
	JavaRDD<String> flatMapData = rddData.flatMap(flatMapFunction);


	flatMapData.foreach(new VoidFunction<String>() {
		@Override
		public void call(String v) throws Exception {
			System.out.println(v);
		}
	});

	sc.close();
}
 
Example 6
Source Project: SparkDemo   Source File: Union.java    License: MIT License
static void union(JavaSparkContext sc) {
    List<String> datas1 = Arrays.asList("张三", "李四");
    List<String> datas2 = Arrays.asList("tom", "gim");

    JavaRDD<String> data1RDD = sc.parallelize(datas1);
    JavaRDD<String> data2RDD = sc.parallelize(datas2);

    /**
     * Merge the two RDDs without removing duplicates; the element types of the two RDDs must match.
     */
    JavaRDD<String> unionRDD = data1RDD
            .union(data2RDD);

    unionRDD.foreach(new VoidFunction<String>() {
		@Override
		public void call(String t) throws Exception {
			System.out.println(t);
		}
	});

    sc.close();
}
 
Example 7
Source Project: SparkDemo   Source File: Distinct.java    License: MIT License
private static void distinct(JavaSparkContext sc) {
	List<String> datas = Arrays.asList("张三", "李四", "tom", "张三");

	/**
	 * Deduplicate the RDD; this involves a shuffle operation.
	 */
	JavaRDD<String> distinctRDD = sc.parallelize(datas).distinct();
	
	distinctRDD.foreach(new VoidFunction<String>() {
		@Override
		public void call(String t) throws Exception {
			System.out.println(t);
		}
	});
}
 
Example 8
Source Project: SparkDemo   Source File: SampleAndTake.java    License: MIT License
static void sample(JavaSparkContext sc) {
	List<Integer> datas = Arrays.asList(1, 2, 3, 7, 4, 5, 8);

	JavaRDD<Integer> dataRDD = sc.parallelize(datas);
	
	/**
	 * Random sampling: when withReplacement is true, sampled elements are put back and may be
	 * sampled more than once; false means sampling without replacement. fraction is the sampling
	 * ratio and seed is the random number seed.
	 */
	JavaRDD<Integer> sampleRDD = dataRDD.sample(false, 0.5, System.currentTimeMillis());
	
	// TODO dataRDD.takeSample(false, 3);
	// TODO dataRDD.take(3)

	sampleRDD.foreach(new VoidFunction<Integer>() {
		@Override
		public void call(Integer t) throws Exception {
			System.out.println(t);
		}
	});

	sc.close();
}
 
Example 9
Source Project: SparkDemo   Source File: Intersection.java    License: MIT License
static void intersection(JavaSparkContext sc) {
	List<String> datas1 = Arrays.asList("张三", "李四", "tom");
	List<String> datas2 = Arrays.asList("tom", "gim");

	/**
	 * Return the intersection of the two RDDs.
	 */
	JavaRDD<String> intersectionRDD = sc.parallelize(datas1).intersection(sc.parallelize(datas2));

	intersectionRDD.foreach(new VoidFunction<String>() {

		@Override
		public void call(String t) throws Exception {
			System.out.println(t);
		}
	});

}
 
Example 10
Source Project: HadoopCV   Source File: InputFormatTest.java    License: Apache License 2.0
public static void main(String[] args) {
	SparkConf conf = new SparkConf().setAppName("VideoInput").setMaster("local[2]");
	JavaSparkContext sc = new JavaSparkContext(conf);
	
	Configuration hc = new org.apache.hadoop.conf.Configuration();
	JavaPairRDD<Text, HBMat> video = sc.newAPIHadoopFile("data/bike.avi", VideoInputFormat.class, Text.class, HBMat.class,hc);
	
	video.foreach(new VoidFunction<Tuple2<Text,HBMat>>() {	
		@Override
		public void call(Tuple2<Text, HBMat> tuple) throws Exception {
			HBMat image = (HBMat)tuple._2;
			System.out.print(image.getBmat().dump());
		}
	});
	
	System.out.print(video.count());
}
 
Example 11
Source Project: SparkDemo   Source File: Join.java    License: MIT License
static void join(JavaSparkContext sc) {
    List<Tuple2<Integer, String>> products = new ArrayList<>();
    products.add(new Tuple2<>(1, "苹果"));
    products.add(new Tuple2<>(2, "梨"));
    products.add(new Tuple2<>(3, "香蕉"));
    products.add(new Tuple2<>(4, "石榴"));

    List<Tuple2<Integer, Integer>> counts = new ArrayList<>();
    counts.add(new Tuple2<>(1, 7));
    counts.add(new Tuple2<>(2, 3));
    counts.add(new Tuple2<>(3, 8));
    counts.add(new Tuple2<>(4, 3));
    counts.add(new Tuple2<>(5, 9));

    JavaPairRDD<Integer, String> productsRDD = sc.parallelizePairs(products);
    JavaPairRDD<Integer, Integer> countsRDD = sc.parallelizePairs(counts);

    /**
     * Join <K, V> with <K, W>, returning (K, (V, W)). The outer-join variants are
     * leftOuterJoin, rightOuterJoin, and fullOuterJoin.
     */
    productsRDD.join(countsRDD)
            .foreach(new VoidFunction<Tuple2<Integer,Tuple2<String,Integer>>>() {
				@Override
				public void call(Tuple2<Integer, Tuple2<String, Integer>> t) throws Exception {
					   System.out.println(t._1 + "\t" + t._2());
				}
			});
}
 
Example 12
Source Project: SparkDemo   Source File: Cogroup.java    License: MIT License
private static void cogroup(JavaSparkContext sc) {
	List<Tuple2<Integer, String>> datas1 = new ArrayList<>();
	datas1.add(new Tuple2<>(1, "苹果"));
	datas1.add(new Tuple2<>(2, "梨"));
	datas1.add(new Tuple2<>(3, "香蕉"));
	datas1.add(new Tuple2<>(4, "石榴"));

	List<Tuple2<Integer, Integer>> datas2 = new ArrayList<>();
	datas2.add(new Tuple2<>(1, 7));
	datas2.add(new Tuple2<>(2, 3));
	datas2.add(new Tuple2<>(3, 8));
	datas2.add(new Tuple2<>(4, 3));

	List<Tuple2<Integer, String>> datas3 = new ArrayList<>();
	datas3.add(new Tuple2<>(1, "7"));
	datas3.add(new Tuple2<>(2, "3"));
	datas3.add(new Tuple2<>(3, "8"));
	datas3.add(new Tuple2<>(4, "3"));
	datas3.add(new Tuple2<>(4, "4"));
	datas3.add(new Tuple2<>(4, "5"));
	datas3.add(new Tuple2<>(4, "6"));

	/**
	 * Cogroup: for each key, gather that key's values from every RDD into a separate collection,
	 * producing one Iterable per input RDD. Unlike reduceByKey, which combines values within a
	 * single RDD, cogroup collects values with the same key across all of the RDDs.
	 */
	sc.parallelizePairs(datas1).cogroup(sc.parallelizePairs(datas2), sc.parallelizePairs(datas3)).foreach(
			new VoidFunction<Tuple2<Integer, Tuple3<Iterable<String>, Iterable<Integer>, Iterable<String>>>>() {
				@Override
				public void call(Tuple2<Integer, Tuple3<Iterable<String>, Iterable<Integer>, Iterable<String>>> t)
						throws Exception {
					System.out.println(t._1 + "==" + t._2);
				}
			});
}
 
Example 13
Source Project: SparkDemo   Source File: JDBCDataSource.java    License: MIT License
public static void main(String[] args) {
//		SparkConf conf = new SparkConf().setAppName("JDBCDataSource").setMaster("local");
		JavaSparkContext sc = SparkUtils.getRemoteSparkContext(JDBCDataSource.class);
		SQLContext sqlContext = new SQLContext(sc);

		Map<String, String> options = new HashMap<String, String>();
		options.put("url", "jdbc:mysql://192.168.2.129:3306/hive");
		options.put("dbtable", "t_user");
		options.put("user", "root");
		options.put("password", "666666");

		// Load the JDBC options; this does not connect to the database immediately
		Dataset<Row> dataset1 = sqlContext.read().format("jdbc").options(options).load();

		//		options.put("dbtable", "tb_item");
		//		DataFrame dataFrame2 = sqlContext.read().format("jdbc").options(options).load();

		// Read the JDBC table data
		dataset1.javaRDD().foreach(new VoidFunction<Row>() {
			@Override
			public void call(Row row) throws Exception {
				System.out.println(row);
			}
		});


		// Store the RDD data in MySQL
		saveToMysql( sqlContext, options);

		sc.close();
	}
 
Example 14
Source Project: SparkDemo   Source File: JDBCDataSource.java    License: MIT License
/**
 * @category Store the RDD data into a MySQL database
 * @param sqlContext
 * @param options
 */
private static void saveToMysql( SQLContext sqlContext, Map<String, String> options) {
	options.put("url", "jdbc:mysql://192.168.68.1:3306/tourismdb");
	options.put("dbtable", "t_user");
	Dataset<Row> dataset = sqlContext.read().format("jdbc").options(options).load();

	dataset.javaRDD().foreach(new VoidFunction<Row>() {
		@Override
		public void call(Row row) throws Exception {
			String sql = "insert into t_user( name, password, phone, email,type,status,del) values("
					+ "'"+ row.getString(1) + "'," 
					+ "'"+ row.getString(2) + "'," 
					+ "'"+ row.getString(3) + "'," 
					+ "'"+ row.getString(4) + "'," 
					+ row.getInt(5)+ ","
					+ row.getInt(6)+ ","
					+ row.getInt(7)+ ")";
			System.out.println(sql);
			Class.forName("com.mysql.jdbc.Driver");
			Connection conn = null;
			Statement statement = null;
			try {
				conn = DriverManager.getConnection("jdbc:mysql://192.168.68.129:3306/sparkdemo","root","666666");
				statement = conn.createStatement();
				statement.executeUpdate(sql);
			} catch (Exception e) {
				e.printStackTrace();
			} finally {
				if(statement!=null){
					statement.close();
				}
				if (conn!=null) {
					conn.close();
				}
			}
		}
	});
}
 
Example 15
Source Project: SparkDemo   Source File: AccumulatorValue.java    License: MIT License
public static void main(String[] args) {
	JavaSparkContext sc = SparkUtils.getLocalSparkContext(AccumulatorValue.class);

	// Create the accumulator
	final Accumulator<Integer> accumulator = sc.accumulator(0, "My Accumulator");

	List<Integer> list = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
	JavaRDD<Integer> listRDD = sc.parallelize(list);

	listRDD.foreach(new VoidFunction<Integer>() {
		@Override
		public void call(Integer n) throws Exception {
			accumulator.add(n);
			// The accumulator value cannot be read here on an executor; doing so throws an exception
			// System.out.println(accumulator.value());
		}
	});

	// The value can only be read on the driver
	System.out.println(accumulator.value());
	
	try {
		// Sleep so the Spark web UI (http://192.168.68.1:4040) can be inspected before the context closes;
		// the first factor is a long so the multiplication does not silently overflow int
		Thread.sleep(5000L * 5000 * 5000);
	} catch (Exception e) {
		e.printStackTrace();
	}

	sc.close();
}
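The Accumulator<Integer> API used above is deprecated as of Spark 2.0. Assuming a Spark 2.x build, roughly the same example can be sketched with the newer LongAccumulator; this is an illustrative rewrite, not code from SparkDemo:

import java.util.Arrays;
import java.util.List;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.util.LongAccumulator;

public class LongAccumulatorSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext("local", "LongAccumulatorSketch");

        // Register a LongAccumulator on the underlying SparkContext
        final LongAccumulator accumulator = sc.sc().longAccumulator("My Accumulator");

        List<Integer> list = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
        JavaRDD<Integer> listRDD = sc.parallelize(list);

        // Executors only add to the accumulator; the value is read back on the driver
        listRDD.foreach((VoidFunction<Integer>) n -> accumulator.add(n));

        System.out.println(accumulator.value()); // 55

        sc.close();
    }
}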
 
Example 16
Source Project: spark-streaming-direct-kafka   Source File: Functions.java    License: Apache License 2.0
public static <T> VoidFunction<T> noOp() {
    return new VoidFunction<T>() {
        @Override
        public void call(T t) {
            // do nothing
        }
    };
}
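A no-op VoidFunction like this is typically handed to foreach or foreachRDD purely to force evaluation of a lazy RDD or DStream when the side effects happen elsewhere, for example in an upstream map. A usage sketch, assuming a JavaRDD<String> named rdd (not code from this project):

rdd.foreach(Functions.<String>noOp());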
 
Example 17
private void encryptedColumnCalc(JavaPairDStream<Long,BigInteger> encRowRDD)
{
  // Multiply the column values by colNum: emit <colNum, finalColVal>
  JavaPairDStream<Long,BigInteger> encColRDD;
  if (colMultReduceByKey)
  {
    encColRDD = encRowRDD.reduceByKey(new EncColMultReducer(bVars), numColMultPartitions);
  }
  else
  {
    encColRDD = encRowRDD.groupByKey(numColMultPartitions).mapToPair(new EncColMultGroupedMapper(bVars));
  }

  // Update the output name, by batch number
  bVars.setOutput(outputFile + "_" + accum.numBatchesGetValue());

  // Form and write the response object
  encColRDD.repartition(1).foreachRDD((VoidFunction<JavaPairRDD<Long,BigInteger>>) rdd -> {
    rdd.foreachPartition(new FinalResponseFunction(accum, bVars));

    int maxBatchesVar = bVars.getMaxBatches();
    if (maxBatchesVar != -1 && accum.numBatchesGetValue() == maxBatchesVar)
    {
      logger.info("num batches = maxBatches = " + maxBatchesVar + "; shutting down");
      System.exit(0);
    }

  });
}
 
Example 18
@SuppressWarnings("deprecation")
public static void persists(JavaPairDStream<String, Iterable<Long>> partitonOffset, Properties props) {
  partitonOffset.foreachRDD(new VoidFunction<JavaPairRDD<String,Iterable<Long>>>() {
    @Override
    public void call(JavaPairRDD<String, Iterable<Long>> po) throws Exception {
      List<Tuple2<String, Iterable<Long>>> poList = po.collect();
      doPersists(poList, props);
    }
  });
}
 
Example 19
@SuppressWarnings("deprecation")
public static void persists(DStream<Tuple2<String, Iterable<Long>>> partitonOffset, Properties props) {
  ClassTag<Tuple2<String, Iterable<Long>>> tuple2ClassTag =
      ScalaUtil.<String, Iterable<Long>>getTuple2ClassTag();
  JavaDStream<Tuple2<String, Iterable<Long>>> jpartitonOffset =
      new JavaDStream<Tuple2<String, Iterable<Long>>>(partitonOffset, tuple2ClassTag);
  jpartitonOffset.foreachRDD(new VoidFunction<JavaRDD<Tuple2<String, Iterable<Long>>>>() {
    @Override
    public void call(JavaRDD<Tuple2<String, Iterable<Long>>> po) throws Exception {
      List<Tuple2<String, Iterable<Long>>> poList = po.collect();
      doPersists(poList, props);
    }
  });
}
 
Example 20
public static void main(String[] args) throws Exception {

        final String dbUrl = args[0];
        final String hostname = args[1];
        final String port = args[2];
        final String inTargetSchema = args[3];
        final String inTargetTable = args[4];

        SparkConf conf = new SparkConf();

        JavaStreamingContext ssc = new JavaStreamingContext(conf, new Duration(500));

        JavaReceiverInputDStream<String> stream = ssc.socketTextStream(hostname, Integer.parseInt(port));

        SparkSession spark = SparkSession.builder().getOrCreate();

        // Create a SplicemachineContext based on the provided DB connection
        SplicemachineContext splicemachineContext = new SplicemachineContext(dbUrl);

        // Set target tablename and schemaname
        final String table = inTargetSchema + "." + inTargetTable;

        stream.foreachRDD((VoidFunction<JavaRDD<String>>) rdd -> {
            JavaRDD<Row> rowRDD = rdd.map((Function<String, Row>) s -> RowFactory.create(s));
            Dataset<Row> df = spark.createDataFrame(rowRDD, splicemachineContext.getSchema(table));

            splicemachineContext.insert(df, table);
        });

        ssc.start();
        ssc.awaitTermination();
    }
 
Example 21
public static void main(String[] args) throws Exception {

        final String dbUrl = args[0];
        final String hostname = args[1];
        final String port = args[2];
        final String inTargetSchema = args[3];
        final String inTargetTable = args[4];

        SparkConf conf = new SparkConf();

        JavaStreamingContext ssc = new JavaStreamingContext(conf, new Duration(500));
        SpliceSpark.setContext(ssc.sparkContext());

        SparkSession spark = SpliceSpark.getSessionUnsafe();

        JavaReceiverInputDStream<String> stream = ssc.socketTextStream(hostname, Integer.parseInt(port));

        // Create a SplicemachineContext based on the provided DB connection
        SplicemachineContext splicemachineContext = new SplicemachineContext(dbUrl);

        // Set target tablename and schemaname
        final String table = inTargetSchema + "." + inTargetTable;

        stream.foreachRDD((VoidFunction<JavaRDD<String>>) rdd -> {
            JavaRDD<Row> rowRDD = rdd.map((Function<String, Row>) s -> RowFactory.create(s));
            Dataset<Row> df = spark.createDataFrame(rowRDD, splicemachineContext.getSchema(table));

            splicemachineContext.insert(df, table);
        });

        ssc.start();
        ssc.awaitTermination();
    }
 
Example 22
Source Project: BigDataArchitect   Source File: WordCountJava.java    License: Apache License 2.0
public static void main(String[] args) throws FileNotFoundException {

        SparkConf conf = new SparkConf();
        conf.setAppName("java-wordcount");
        conf.setMaster("local");

        JavaSparkContext jsc = new JavaSparkContext(conf);

        JavaRDD<String> fileRDD = jsc.textFile("bigdata-spark/data/testdata.txt");

        JavaRDD<String> words = fileRDD.flatMap(new FlatMapFunction<String, String>() {
            public Iterator<String> call(String line) throws Exception {
                return Arrays.asList(line.split(" ")).iterator();
            }
        });

        JavaPairRDD<String, Integer> pairWord = words.mapToPair(new PairFunction<String, String, Integer>() {
            public Tuple2<String, Integer> call(String word) throws Exception {
                return new Tuple2<String, Integer>(word, 1);
            }
        });

        JavaPairRDD<String, Integer> res = pairWord.reduceByKey(new Function2<Integer, Integer, Integer>() {
            public Integer call(Integer oldV, Integer v) throws Exception {
                return oldV + v;
            }
        });

        res.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            public void call(Tuple2<String, Integer> value) throws Exception {
                System.out.println(value._1+"\t"+value._2);
            }
        });

//
//        RandomAccessFile rfile = new RandomAccessFile("ooxx","rw");
//
////        rfile.seek(222);
//        FileChannel channel = rfile.getChannel();
//        //  linux  fd   write(fd)  read(fd)
//
//
//        ByteBuffer b1 = ByteBuffer.allocate(1024);
//        ByteBuffer b2 = ByteBuffer.allocateDirect(1024);
//        MappedByteBuffer buffer = channel.map(FileChannel.MapMode.READ_WRITE, 80, 120);
//


    }
 
Example 23
/**
	 * @param args
	 */
	public static void main(String[] args) {
		//C:\Users\sumit.kumar\Downloads\bin\warehouse
		//System.setProperty("hadoop.home.dir", "C:\\Users\\sumit.kumar\\Downloads");
		String logFile = "src/main/resources/Apology_by_Plato.txt"; // Should be some file on your system
		Logger rootLogger = LogManager.getRootLogger();
		rootLogger.setLevel(Level.WARN);
		 SparkConf conf = new SparkConf().setMaster("local").setAppName("ActionExamples").set("spark.hadoop.validateOutputSpecs", "false");
			JavaSparkContext sparkContext = new JavaSparkContext(conf);
		    JavaRDD<Integer> rdd = sparkContext.parallelize(Arrays.asList(1, 2, 3,4,5),3).cache();	
		    JavaRDD<Integer> evenRDD= rdd.filter(new org.apache.spark.api.java.function.Function<Integer, Boolean>() {
			@Override
			public Boolean call(Integer v1) throws Exception {
			  return (v1 % 2) == 0;
				}
			});
		    
		    evenRDD.persist(StorageLevel.MEMORY_AND_DISK());
		    evenRDD.foreach(new VoidFunction<Integer>() {
			@Override
			public void call(Integer t) throws Exception {
			System.out.println("The value of RDD are :"+t);
			 }
			});
		   //unpersisting the RDD 
		   evenRDD.unpersist();
		   rdd.unpersist();
		   
		   /* JavaRDD<String> lines = spark.read().textFile(logFile).javaRDD().cache();
		    System.out.println("DEBUG: \n"+ lines.toDebugString());
		   long word= lines.count();
		   JavaRDD<String> distinctLines=lines.distinct();
		   System.out.println("DEBUG: \n"+ distinctLines.toDebugString());
		   JavaRDD<String> finalRdd=lines.subtract(distinctLines);
		    
		   
		   System.out.println("DEBUG: \n"+ finalRdd.toDebugString());
		   System.out.println("The count is "+word);
		   System.out.println("The count is "+distinctLines.count());
		   System.out.println("The count is "+finalRdd.count());
		   
		   finalRdd.foreach(new VoidFunction<String>() {
			
			@Override
			public void call(String t) throws Exception {
				// TODO Auto-generated method stub
				System.out.println(t);
			}
		});
*/	    /*SparkConf conf = new SparkConf().setAppName("Simple Application");
	    JavaSparkContext sc = new JavaSparkContext(conf);
	    StorageLevel newLevel;
		JavaRDD<String> logData = sc.textFile(logFile).cache();

	    long numAs = logData.filter(new Function(logFile, logFile, logFile, logFile, false) {
	      public Boolean call(String s) { return s.contains("a"); }
	    }).count();

	    long numBs = logData.filter(new Function(logFile, logFile, logFile, logFile, false) {
	      public Boolean call(String s) { return s.contains("b"); }
	    }).count();

	    System.out.println("Lines with a: " + numAs + ", lines with b: " + numBs);
	    
	    sc.stop();*/

	}
 
Example 24
Source Project: SparkDemo   Source File: JavaStreamingTestExample.java    License: MIT License
public static void main(String[] args) throws Exception {
  if (args.length != 3) {
    System.err.println("Usage: JavaStreamingTestExample " +
      "<dataDir> <batchDuration> <numBatchesTimeout>");
      System.exit(1);
  }

  String dataDir = args[0];
  Duration batchDuration = Seconds.apply(Long.parseLong(args[1]));
  int numBatchesTimeout = Integer.parseInt(args[2]);

  SparkConf conf = new SparkConf().setMaster("local").setAppName("StreamingTestExample");
  JavaStreamingContext ssc = new JavaStreamingContext(conf, batchDuration);

  ssc.checkpoint(Utils.createTempDir(System.getProperty("java.io.tmpdir"), "spark").toString());

  // $example on$
  JavaDStream<BinarySample> data = ssc.textFileStream(dataDir).map(
    new Function<String, BinarySample>() {
      @Override
      public BinarySample call(String line) {
        String[] ts = line.split(",");
        boolean label = Boolean.parseBoolean(ts[0]);
        double value = Double.parseDouble(ts[1]);
        return new BinarySample(label, value);
      }
    });

  StreamingTest streamingTest = new StreamingTest()
    .setPeacePeriod(0)
    .setWindowSize(0)
    .setTestMethod("welch");

  JavaDStream<StreamingTestResult> out = streamingTest.registerStream(data);
  out.print();
  // $example off$

  // Stop processing if test becomes significant or we time out
  timeoutCounter = numBatchesTimeout;

  out.foreachRDD(new VoidFunction<JavaRDD<StreamingTestResult>>() {
    @Override
    public void call(JavaRDD<StreamingTestResult> rdd) {
      timeoutCounter -= 1;

      boolean anySignificant = !rdd.filter(new Function<StreamingTestResult, Boolean>() {
        @Override
        public Boolean call(StreamingTestResult v) {
          return v.pValue() < 0.05;
        }
      }).isEmpty();

      if (timeoutCounter <= 0 || anySignificant) {
        rdd.context().stop();
      }
    }
  });

  ssc.start();
  ssc.awaitTermination();
}
 
Example 25
Source Project: SparkDemo   Source File: ReduceByKey.java    License: MIT License
/**
 * @category Count the occurrences of each word in a text file
 * @param sc
 */
private static void reduceByKey(JavaSparkContext sc) {
	JavaRDD<String> lines = sc.textFile(Constant.LOCAL_FILE_PREX +"README.md");

	/**
	 * Split each line on the space character and flatten the output.
	 */
	JavaRDD<String> wordsRDD = lines.flatMap(new FlatMapFunction<String, String>() {

		private static final long serialVersionUID = 1L;

		public Iterator<String> call(String line) throws Exception {
			List<String> words = Arrays.asList(line.split(" "));
			return words.iterator();
		}
	});

	/**
	 * Convert each word to a tuple in the format (word, 1).
	 */
	JavaPairRDD<String, Integer> wordsCount = wordsRDD.mapToPair(new PairFunction<String, String, Integer>() {

		private static final long serialVersionUID = 1L;

		public Tuple2<String, Integer> call(String word) throws Exception {
			return new Tuple2<String, Integer>(word, 1);
		}
	});

	/**
	 * Group by the tuple key (that is, the word) and sum the counts.
	 */
	JavaPairRDD<String, Integer> resultRDD = wordsCount.reduceByKey(new Function2<Integer, Integer, Integer>() {

		private static final long serialVersionUID = 1L;

		public Integer call(Integer v1, Integer v2) throws Exception {
			return v1 + v2;
		}
	});

	resultRDD.foreach(new VoidFunction<Tuple2<String, Integer>>() {

		private static final long serialVersionUID = 1L;

		public void call(Tuple2<String, Integer> t) throws Exception {
			System.out.println(t._1 + "\t" + t._2());
		}
	});

	sc.close();
}
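For comparison, under Spark 2.x with Java 8 the same word count can be written far more compactly with lambdas in place of the anonymous classes. This is an illustrative rewrite assuming the same imports as the original file, not code from SparkDemo:

private static void reduceByKeyWithLambdas(JavaSparkContext sc) {
	JavaRDD<String> lines = sc.textFile(Constant.LOCAL_FILE_PREX + "README.md");

	lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator())   // split on spaces and flatten
		.mapToPair(word -> new Tuple2<>(word, 1))                      // emit (word, 1)
		.reduceByKey(Integer::sum)                                     // sum the counts per word
		.foreach((VoidFunction<Tuple2<String, Integer>>) t -> System.out.println(t._1 + "\t" + t._2));

	sc.close();
}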
 
Example 26
Source Project: envelope   Source File: Runner.java    License: Apache License 2.0
/**
 * Run the Envelope pipeline as a Spark Streaming job.
 * @param steps The full configuration of the Envelope pipeline
 */
@SuppressWarnings("unchecked")
private void runStreaming(final Set<Step> steps) throws Exception {
  final Set<Step> independentNonStreamingSteps = StepUtils.getIndependentNonStreamingSteps(steps);
  runBatch(independentNonStreamingSteps);

  Set<StreamingStep> streamingSteps = StepUtils.getStreamingSteps(steps);
  for (final StreamingStep streamingStep : streamingSteps) {
    LOG.debug("Setting up streaming step: " + streamingStep.getName());

    JavaDStream stream = streamingStep.getStream();

    stream.foreachRDD(new VoidFunction<JavaRDD<?>>() {
      @Override
      public void call(JavaRDD<?> raw) throws Exception {
        // Some independent steps might be repeating steps that have been flagged for reload
        StepUtils.resetRepeatingSteps(steps);
        // This will run any batch steps (and dependents) that are not submitted
        runBatch(independentNonStreamingSteps);

        streamingStep.setData(streamingStep.translate(raw));
        streamingStep.writeData();
        streamingStep.setState(StepState.FINISHED);

        Set<Step> batchSteps = StepUtils.mergeLoadedSteps(steps, streamingStep, baseConfig);
        Set<Step> dependentSteps = StepUtils.getAllDependentSteps(streamingStep, batchSteps);
        batchSteps.add(streamingStep);
        batchSteps.addAll(streamingStep.loadNewBatchSteps());
        batchSteps.addAll(independentNonStreamingSteps);
        runBatch(batchSteps);

        StepUtils.resetSteps(dependentSteps);

        streamingStep.recordProgress(raw);
      }
    });

    LOG.debug("Finished setting up streaming step: " + streamingStep.getName());
  }

  JavaStreamingContext jsc = Contexts.getJavaStreamingContext();
  jsc.start();
  LOG.debug("Streaming context started");
  jsc.awaitTermination();
  LOG.debug("Streaming context terminated");
}
 
Example 27
Source Project: beam   Source File: TranslationUtils.java    License: Apache License 2.0
public static <T> VoidFunction<T> emptyVoidFunction() {
  return t -> {
    // Empty implementation.
  };
}
 
Example 28
private void start() {
  // Create a local StreamingContext with two working thread and batch
  // interval of
  // 1 second
  SparkConf conf = new SparkConf().setMaster("local[2]").setAppName(
      "Streaming Ingestion File System Text File to Dataframe");
  JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations
      .seconds(5));

  JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils
      .getInputDirectory());

  msgDataStream.print();
  // Create JavaRDD<Row>
  msgDataStream.foreachRDD(new VoidFunction<JavaRDD<String>>() {
    private static final long serialVersionUID = -590010339928376829L;

    @Override
    public void call(JavaRDD<String> rdd) {
      JavaRDD<Row> rowRDD = rdd.map(new Function<String, Row>() {
        private static final long serialVersionUID = 5167089361335095997L;

        @Override
        public Row call(String msg) {
          Row row = RowFactory.create(msg);
          return row;
        }
      });
      // Create Schema
      StructType schema = DataTypes.createStructType(
          new StructField[] { DataTypes.createStructField("Message",
              DataTypes.StringType, true) });

      // Get Spark 2.0 session
      SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context()
          .getConf());
      Dataset<Row> msgDataFrame = spark.createDataFrame(rowRDD, schema);
      msgDataFrame.show();
    }
  });

  jssc.start();
  try {
    jssc.awaitTermination();
  } catch (InterruptedException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
  }
}
 
Example 29
Source Project: learning-hadoop   Source File: JavaHBaseMapGetPutExample.java    License: Apache License 2.0
public static void main(String args[]) {
  if (args.length == 0) {
    System.out
        .println("JavaHBaseBulkGetExample  {master} {tableName}");
  }

  String master = args[0];
  String tableName = args[1];

  JavaSparkContext jsc = new JavaSparkContext(master,
      "JavaHBaseBulkGetExample");
  jsc.addJar("SparkHBase.jar");

  List<byte[]> list = new ArrayList<byte[]>();
  list.add(Bytes.toBytes("1"));
  list.add(Bytes.toBytes("2"));
  list.add(Bytes.toBytes("3"));
  list.add(Bytes.toBytes("4"));
  list.add(Bytes.toBytes("5"));

  //All Spark
  JavaRDD<byte[]> rdd = jsc.parallelize(list);

  //All HBase
  Configuration conf = HBaseConfiguration.create();
  conf.addResource(new Path("/etc/hbase/conf/core-site.xml"));
  conf.addResource(new Path("/etc/hbase/conf/hbase-site.xml"));

  //This is me
  JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf);

  //This is me
  hbaseContext.foreachPartition(rdd, null);
  
  hbaseContext.foreach(rdd, new VoidFunction<Tuple2<byte[], HConnection>>() {


    public void call(Tuple2<byte[], HConnection> t)
        throws Exception {
      HTableInterface table1 = t._2.getTable(Bytes.toBytes("Foo"));
      
      byte[] b = t._1;
      Result r = table1.get(new Get(b));
      if (r.getExists()) {
        table1.put(new Put(b));
      }
      
    }
  });
  
}
 
Example 30
public static void main(String[] args) {
	if (args.length == 0) {
		System.err
				.println("Usage: SparkStreamingFromFlumeToHBaseWindowingExample {master} {host} {port} {table} {columnFamily} {windowInSeconds} {slideInSeconds");
		System.exit(1);
	}

	String master = args[0];
	String host = args[1];
	int port = Integer.parseInt(args[2]);
	String tableName = args[3];
	String columnFamily = args[4];
	int windowInSeconds = Integer.parseInt(args[5]);
	int slideInSeconds = Integer.parseInt(args[6]);
	
	Duration batchInterval = new Duration(2000);
	Duration windowInterval = new Duration(windowInSeconds * 1000);
	Duration slideInterval = new Duration(slideInSeconds * 1000);

	JavaStreamingContext sc = new JavaStreamingContext(master,
			"FlumeEventCount", batchInterval,
			System.getenv("SPARK_HOME"), "/home/cloudera/SparkOnALog.jar");
	
	final Broadcast<String> broadcastTableName = sc.sparkContext().broadcast(tableName);
	final Broadcast<String> broadcastColumnFamily = sc.sparkContext().broadcast(columnFamily);
	
	//JavaDStream<SparkFlumeEvent> flumeStream = sc.flumeStream(host, port);
	
	JavaDStream<SparkFlumeEvent> flumeStream = FlumeUtils.createStream(sc, host, port);
	
	
	JavaPairDStream<String, Integer> lastCounts = flumeStream
			.flatMap(new FlatMapFunction<SparkFlumeEvent, String>() {

				@Override
				public Iterable<String> call(SparkFlumeEvent event)
						throws Exception {
					String bodyString = new String(event.event().getBody()
							.array(), "UTF-8");
					return Arrays.asList(bodyString.split(" "));
				}
			}).map(new PairFunction<String, String, Integer>() {
				@Override
				public Tuple2<String, Integer> call(String str)
						throws Exception {
					return new Tuple2(str, 1);
				}
			}).reduceByKeyAndWindow(new Function2<Integer, Integer, Integer>() {

				@Override
				public Integer call(Integer x, Integer y) throws Exception {
					// TODO Auto-generated method stub
					return x.intValue() + y.intValue();
				}
			}, windowInterval, slideInterval);
			
			
			lastCounts.foreach(new Function2<JavaPairRDD<String,Integer>, Time, Void>() {

				@Override
				public Void call(JavaPairRDD<String, Integer> values,
						Time time) throws Exception {
					
					values.foreach(new VoidFunction<Tuple2<String, Integer>> () {

						@Override
						public void call(Tuple2<String, Integer> tuple)
								throws Exception {
							HBaseCounterIncrementor incrementor = 
									HBaseCounterIncrementor.getInstance(broadcastTableName.value(), broadcastColumnFamily.value());
							incrementor.incerment("Counter", tuple._1(), tuple._2());
							System.out.println("Counter:" + tuple._1() + "," + tuple._2());
							
						}} );
					
					return null;
				}});
	
	

	sc.start();

}