Java Code Examples for org.apache.spark.api.java.function.Function

The following examples show how to use org.apache.spark.api.java.function.Function. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may want to check out the right sidebar which shows the related API usage.
Example 1
Source Project: sparkResearch   Source File: SparkStreamDemo.java    License: Apache License 2.0 6 votes vote down vote up
public static void main(String[] args) {
    // Local mode with two cores; micro-batch interval of one second.
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("sparkStreamIng");
    JavaStreamingContext javaStreamingContext = new JavaStreamingContext(conf, Durations.seconds(1));
    // DStream fed from a socket on localhost:8080.
    JavaReceiverInputDStream<String> dStream = javaStreamingContext.socketTextStream("localhost", 8080);
    // Keep only the lines containing "error".
    JavaDStream<String> errorLine = dStream.filter((Function<String, Boolean>) line -> line.contains("error"));
    // Print the matching lines of each batch.
    errorLine.print();
    try {
        // Start the streaming computation and block until it terminates.
        javaStreamingContext.start();
        javaStreamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
 
Example 2
Source Project: sparkResearch   Source File: LogError.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Runs a transformation (filter) and actions (count/take) over a log file.
 *
 * @param sparkContext context used to load the log file from /usr/local/log
 */
public void log(JavaSparkContext sparkContext) {
    JavaRDD<String> inputRDD = sparkContext.textFile("/usr/local/log");
    // BUG FIX: the predicate previously returned null, which makes Spark throw a
    // NullPointerException when unboxing the Boolean. Keep lines containing "error",
    // which is what a variable named errorRDD implies.
    JavaRDD<String> errorRDD = inputRDD.filter(new Function<String, Boolean>() {
        @Override
        public Boolean call(String v1) throws Exception {
            return v1.contains("error");
        }
    });

    long errorRDDCount = errorRDD.count();
    System.out.println("errorRDD count is " + errorRDDCount);
    // Print at most the first ten matching lines.
    for (String rddLine : errorRDD.take(10)) {
        System.out.println("errorRDD 数据is " + rddLine);
    }
}
 
Example 3
Source Project: sparkResearch   Source File: SparkSqlApplication.java    License: Apache License 2.0 6 votes vote down vote up
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder().master("local")
            .appName("Java Spark SQL")
            .getOrCreate();

    Dataset<Row> dataset = sparkSession.read().json("D:\\a.txt");
    // Project a single column.
    dataset.select("name").show();
    // Project two columns, adding 1 to every age value.
    dataset.select(col("name"), col("age").plus(1)).show();
    // Rows whose age is greater than 21.
    dataset.filter(col("age").gt(21)).show();
    // Aggregate: row count per age.
    dataset.groupBy("age").count().show();
    // Dump the whole dataset.
    dataset.show();


    /* Run a SQL query programmatically. */
    // A temporary view must be registered first.
    dataset.createOrReplaceTempView("user");
    Dataset<Row> users = sparkSession.sql("SELECT * FROM user");

    // NOTE(review): this mapped RDD is never acted on; kept for parity with the original example.
    JavaRDD<Object> firstColumn = users.toJavaRDD().map((Function<Row, Object>) row -> row.getString(0));
    users.show();
}
 
Example 4
Source Project: sparkResearch   Source File: BroadCastParam.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Broadcast-variable demo.
 *
 * @param args unused command-line arguments
 */
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local[4]").appName("AttackFind").getOrCreate();
    // Derive a JavaSparkContext from the session's underlying SparkContext.
    JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(sparkSession.sparkContext());
    // Data to ship to every executor; broadcast variables are read-only there.
    final List<String> whitelist = Arrays.asList("190099HJLL", "98392QUEYY", "561788LLKK");
    // Publish the list as a broadcast variable.
    final Broadcast<List<String>> broadcast = javaSparkContext.broadcast(whitelist);
    // Sample pair data to filter.
    JavaPairRDD<String, String> pairRDD = javaSparkContext.parallelizePairs(Arrays.asList(new Tuple2<>("000", "000")));
    // Keep only the pairs whose value appears in the broadcast list.
    JavaPairRDD<String, String> resultPairRDD = pairRDD.filter((Function<Tuple2<String, String>, Boolean>) pair -> broadcast.value().contains(pair._2));
    resultPairRDD.foreach((VoidFunction<Tuple2<String, String>>) System.out::println);
}
 
Example 5
Source Project: kylin-on-parquet-v2   Source File: SparkUtil.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Reads a Hive table through a Hive-enabled SparkSession and stringifies every row.
 *
 * @param sc        the context whose configuration seeds the session
 * @param hiveTable name of the Hive table to read
 * @return RDD of rows, each as an array of column strings (null columns stay null)
 */
private static JavaRDD<String[]> getOtherFormatHiveInput(JavaSparkContext sc, String hiveTable) {
    SparkSession sparkSession = SparkSession.builder().sparkContext(HiveUtils.withHiveExternalCatalog(sc.sc()))
            .config(sc.getConf()).enableHiveSupport().getOrCreate();
    // Parameterized Dataset<Row> instead of the raw Dataset type (avoids unchecked warnings).
    final Dataset<Row> intermediateTable = sparkSession.table(hiveTable);
    return intermediateTable.javaRDD().map(new Function<Row, String[]>() {
        @Override
        public String[] call(Row row) throws Exception {
            String[] result = new String[row.size()];
            for (int i = 0; i < row.size(); i++) {
                final Object o = row.get(i);
                // Array slots default to null, so only non-null columns need assignment.
                if (o != null) {
                    result[i] = o.toString();
                }
            }
            return result;
        }
    });
}
 
Example 6
Source Project: kylin-on-parquet-v2   Source File: IteratorUtilsTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Merges adjacent tuples with equal keys, summing their values.
 *
 * @param list input key/value tuples
 * @return the merged tuples as a list
 */
private static ArrayList<Tuple2<Integer, Integer>> getResult(List<Tuple2<Integer, Integer>> list) {

        return Lists.newArrayList(IteratorUtils.merge(list.iterator(), new Comparator<Integer>() {
            @Override
            public int compare(Integer o1, Integer o2) {
                // BUG FIX: Integer.compare avoids the overflow that "o1 - o2" produces
                // for operands near Integer.MIN_VALUE / MAX_VALUE.
                return Integer.compare(o1, o2);
            }
        }, new Function<Iterable<Integer>, Integer>() {
            @Override
            public Integer call(Iterable<Integer> v1) throws Exception {
                // Sum all values that share a key.
                int sum = 0;
                for (Integer integer : v1) {
                    sum += integer;
                }
                return sum;
            }
        }));
    }
 
Example 7
Source Project: vn.vitk   Source File: DependencyParser.java    License: GNU General Public License v3.0 6 votes vote down vote up
/**
 * Parses all sentences in an input file, each on a line and writes the result to 
 * the console window containing flattened dependency tuples.
 * @param jsc the Spark context
 * @param inputFileName file with one tagged sentence per line
 */
public void parse(JavaSparkContext jsc, String inputFileName) {
	// Map the text-file RDD directly. The original collect()-then-parallelize()
	// round trip pulled the entire file to the driver only to redistribute it,
	// which wastes memory and network for no behavioral difference.
	JavaRDD<String> input = jsc.textFile(inputFileName);
	JavaRDD<Sentence> sents = input.map(new TaggedLineToSentenceFunction());
	JavaRDD<DependencyGraph> graphs = sents.map(new ParsingFunction());
	JavaRDD<String> rows = graphs.map(new Function<DependencyGraph, String>() {
		private static final long serialVersionUID = -6021310762521034121L;

		@Override
		public String call(DependencyGraph graph) {
			return graph.dependencies();
		}
	});
	// Print every flattened dependency tuple on the driver.
	for (String s : rows.collect()) {
		System.out.println(s);
	}
}
 
Example 8
Source Project: beam   Source File: SparkCombineFn.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Convenience constructor: delegates to the main constructor, selecting the
 * default windowed-accumulator strategy {@code WindowedAccumulator.Type.EXPLODE_WINDOWS}.
 *
 * @param global            whether this is a global (non-keyed) combine
 * @param toValue           extracts the value to combine from an input element
 * @param combineFn         the context-aware combine function to apply
 * @param options           serializable pipeline options
 * @param sideInputs        side inputs keyed by tuple tag
 * @param windowingStrategy the windowing strategy of the input
 */
SparkCombineFn(
    boolean global,
    Function<InputT, ValueT> toValue,
    CombineWithContext.CombineFnWithContext<ValueT, AccumT, OutputT> combineFn,
    SerializablePipelineOptions options,
    Map<TupleTag<?>, KV<WindowingStrategy<?, ?>, SideInputBroadcast<?>>> sideInputs,
    WindowingStrategy<?, ?> windowingStrategy) {
  this(
      global,
      toValue,
      combineFn,
      options,
      sideInputs,
      windowingStrategy,
      WindowedAccumulator.Type.EXPLODE_WINDOWS);
}
 
Example 9
Source Project: vn.vitk   Source File: Tagger.java    License: GNU General Public License v3.0 6 votes vote down vote up
/**
 * Converts each (sentence, tags) row into a "token/TAG token/TAG ..." string.
 * Returns null for a row whose token and tag counts disagree.
 */
private JavaRDD<String> toTaggedSentence(DataFrame output) {
	return output.javaRDD().map(new Function<Row, String>() {
		private static final long serialVersionUID = 4208643510231783579L;
		@Override
		public String call(Row row) throws Exception {
			// Column 0 holds the tokens, column 1 the predicted tags.
			String[] words = row.getString(0).trim().split("\\s+");
			String[] labels = row.getString(1).trim().split("\\s+");
			if (words.length != labels.length) {
				System.err.println("Incompatible lengths!");
				return null;
			}
			StringBuilder result = new StringBuilder(64);
			for (int i = 0; i < words.length; i++) {
				result.append(words[i]).append('/').append(labels[i]).append(' ');
			}
			return result.toString().trim();
		}
	});
}
 
Example 10
Source Project: SparkDemo   Source File: Filter.java    License: MIT License 6 votes vote down vote up
private static void filter(JavaSparkContext sc) {
	List<Integer> datas = Arrays.asList(1, 2, 3, 7, 4, 5, 8);

	JavaRDD<Integer> rddData = sc.parallelize(datas);
	// Keep values >= 4, i.e. drop everything smaller than 4.
	JavaRDD<Integer> filterRDD = rddData.filter((Function<Integer, Boolean>) v -> v >= 4);

	// Print each surviving element.
	filterRDD.foreach((VoidFunction<Integer>) System.out::println);
	sc.close();
}
 
Example 11
Source Project: beam   Source File: SparkCombineFn.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Create concrete accumulator for given type, seeded with already-windowed values.
 *
 * @param toValue          extracts the combined value from an input element
 * @param type             which accumulator strategy to instantiate
 * @param values           previously accumulated windowed values used to seed the accumulator
 * @param windowComparator orders windows; used by the merging strategy
 * @return the accumulator matching {@code type}
 * @throws IllegalArgumentException if {@code type} is not a known strategy
 */
static <InputT, ValueT, AccumT> WindowedAccumulator<InputT, ValueT, AccumT, ?> create(
    Function<InputT, ValueT> toValue,
    Type type,
    Iterable<WindowedValue<AccumT>> values,
    Comparator<BoundedWindow> windowComparator) {
  switch (type) {
    case MERGING:
      return MergingWindowedAccumulator.from(toValue, values, windowComparator);
    case NON_MERGING:
      return NonMergingWindowedAccumulator.from(toValue, values);
    case SINGLE_WINDOW:
    case EXPLODE_WINDOWS:
      // Deliberate fall-through: both single-window strategies share the same
      // accumulator, seeded with the first value when one exists.
      Iterator<WindowedValue<AccumT>> iter = values.iterator();
      if (iter.hasNext()) {
        return SingleWindowWindowedAccumulator.create(toValue, iter.next());
      }
      return SingleWindowWindowedAccumulator.create(toValue);
    default:
      throw new IllegalArgumentException("Unknown type: " + type);
  }
}
 
Example 12
Source Project: vn.vitk   Source File: NGramBuilder.java    License: GNU General Public License v3.0 6 votes vote down vote up
/**
 * Creates a n-gram data frame from text lines.
 * @param lines
 * @return a n-gram data frame.
 */
DataFrame createNGramDataFrame(JavaRDD<String> lines) {
	// Wrap each line as a single-column Row holding its whitespace-separated words.
	JavaRDD<Row> rows = lines.map(new Function<String, Row>(){
		private static final long serialVersionUID = -4332903997027358601L;

		@Override
		public Row call(String line) throws Exception {
			return RowFactory.create(Arrays.asList(line.split("\\s+")));
		}
	});
	// Schema: one non-nullable array-of-string column named "words".
	StructField wordsField = new StructField("words",
			DataTypes.createArrayType(DataTypes.StringType), false,
			Metadata.empty());
	StructType schema = new StructType(new StructField[] { wordsField });
	DataFrame wordDF = new SQLContext(jsc).createDataFrame(rows, schema);
	// Build a bigram (n = 2) model over the word arrays.
	NGram transformer = new NGram().setInputCol("words")
			.setOutputCol("ngrams").setN(2);
	DataFrame ngramDF = transformer.transform(wordDF);
	ngramDF.show(10, false);
	return ngramDF;
}
 
Example 13
Source Project: rheem   Source File: FunctionCompiler.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Create an appropriate {@link Function}-based predicate for deploying the given {@link PredicateDescriptor}
 * on Apache Spark.
 *
 * @param predicateDescriptor describes the function
 * @param operator            that executes the {@link Function}; only required if the {@code descriptor} describes an {@link ExtendedFunction}
 * @param operatorContext     contains optimization information for the {@code operator}
 * @param inputs              that feed the {@code operator}; only required if the {@code descriptor} describes an {@link ExtendedFunction}
 */
public <Type> Function<Type, Boolean> compile(
        PredicateDescriptor<Type> predicateDescriptor,
        SparkExecutionOperator operator,
        OptimizationContext.OperatorContext operatorContext,
        ChannelInstance[] inputs) {
    final Predicate<Type> javaImplementation = predicateDescriptor.getJavaImplementation();
    // Plain predicates only need the thin adapter.
    if (!(javaImplementation instanceof PredicateDescriptor.ExtendedSerializablePredicate)) {
        return new PredicateAdapter<>(javaImplementation);
    }
    // Extended predicates additionally receive an execution context.
    final int iterationNumber = operatorContext.getOptimizationContext().getIterationNumber();
    return new ExtendedPredicateAdapater<>(
            (PredicateDescriptor.ExtendedSerializablePredicate<Type>) javaImplementation,
            new SparkExecutionContext(operator, inputs, iterationNumber)
    );
}
 
Example 14
Source Project: BigDataPlatform   Source File: CaseWhenTest.java    License: GNU General Public License v3.0 5 votes vote down vote up
public static void main(String[] args) {
	SparkConf conf = new SparkConf()
			.setMaster("local") 
			.setAppName("CaseWhenTest");
	JavaSparkContext sc = new JavaSparkContext(conf);
	SQLContext sqlContext = new SQLContext(sc.sc());

	// Turn a plain list of grades into an RDD of single-column Rows.
	List<Integer> grades = Arrays.asList(85, 90, 60, 73);
	JavaRDD<Row> gradeRowsRDD = sc.parallelize(grades).map(new Function<Integer, Row>() {

		private static final long serialVersionUID = 1L;

		@Override
		public Row call(Integer grade) throws Exception {
			return RowFactory.create(grade);
		}
	});

	// Schema: a single nullable integer column named "grade".
	StructType schema = DataTypes.createStructType(Arrays.asList(
			DataTypes.createStructField("grade", DataTypes.IntegerType, true)));
	Dataset<Row> gradesDF = sqlContext.createDataFrame(gradeRowsRDD, schema);
	gradesDF.registerTempTable("grades");

	// Map numeric grades to letter levels with a CASE WHEN expression.
	Dataset<Row> gradeLevelDF = sqlContext.sql(
			"SELECT CASE "
				+ "WHEN grade>=90 THEN 'A' "
				+ "WHEN grade>=80 THEN 'B' "
				+ "WHEN grade>=70 THEN 'C' "
				+ "WHEN grade>=60 THEN 'D' "
				+ "ELSE 'E' "
				+ "END gradeLevel "
			+ "FROM grades");

	gradeLevelDF.show();

	sc.close(); 
}
 
Example 15
Source Project: BigDataPlatform   Source File: IfTest.java    License: GNU General Public License v3.0 5 votes vote down vote up
public static void main(String[] args) {
	SparkConf conf = new SparkConf()
			.setMaster("local") 
			.setAppName("IfTest");
	JavaSparkContext sc = new JavaSparkContext(conf);
	SQLContext sqlContext = new SQLContext(sc.sc());

	// Turn a plain list of grades into an RDD of single-column Rows.
	List<Integer> grades = Arrays.asList(85, 90, 60, 73);
	JavaRDD<Row> gradeRowsRDD = sc.parallelize(grades).map(new Function<Integer, Row>() {

		private static final long serialVersionUID = 1L;

		@Override
		public Row call(Integer grade) throws Exception {
			return RowFactory.create(grade);
		}
	});

	// Schema: a single nullable integer column named "grade".
	StructType schema = DataTypes.createStructType(Arrays.asList(
			DataTypes.createStructField("grade", DataTypes.IntegerType, true)));
	Dataset<Row> gradesDF = sqlContext.createDataFrame(gradeRowsRDD, schema);
	gradesDF.registerTempTable("grades");

	// Classify each grade with SQL's IF expression.
	Dataset<Row> gradeLevelDF = sqlContext.sql(
			"SELECT IF(grade>=80,'GOOD','BAD') gradeLevel "  
			+ "FROM grades");

	gradeLevelDF.show();

	sc.close(); 
}
 
Example 16
Source Project: sparkResearch   Source File: CustomDataFrame.java    License: Apache License 2.0 5 votes vote down vote up
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local")
            .appName("spark app")
            .getOrCreate();

    // Plain RDD of comma-separated lines.
    JavaRDD<String> javaRDD = sparkSession.sparkContext().textFile("URL", 1).toJavaRDD();
    // The schema encoded as a space-separated string of column names.
    String schema = "name age";

    // Build one nullable string field per column name.
    List<StructField> structFieldList = new ArrayList<>();
    for (String fieldName : schema.split(" ")) {
        structFieldList.add(DataTypes.createStructField(fieldName, DataTypes.StringType, true));
    }
    StructType structType = DataTypes.createStructType(structFieldList);

    // Split each line on commas into a two-column Row.
    JavaRDD<Row> rowJavaRDD = javaRDD.map(new Function<String, Row>() {
        @Override
        public Row call(String line) {
            String[] fields = line.split(",");
            return RowFactory.create(fields[0], fields[1].trim());
        }
    });

    // Apply the schema to the RDD.
    Dataset<Row> dataset = sparkSession.createDataFrame(rowJavaRDD, structType);

    // Query the data through a temporary view.
    dataset.createOrReplaceTempView("user");
    Dataset<Row> result = sparkSession.sql("select * from user");
    result.show();
}
 
Example 17
Source Project: kylin-on-parquet-v2   Source File: SparkUtil.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Reads a Hadoop sequence file and splits each Text value into columns.
 */
private static JavaRDD<String[]> getSequenceFormatHiveInput(JavaSparkContext sc, String inputPath) {
    // Only the Text values of the (BytesWritable, Text) pairs are of interest.
    JavaRDD<Text> values = sc.sequenceFile(inputPath, BytesWritable.class, Text.class).values();
    return values.map(new Function<Text, String[]>() {
        @Override
        public String[] call(Text text) throws Exception {
            // Decode only the valid portion of the backing byte array, then split,
            // keeping trailing empty columns (limit -1).
            String line = Bytes.toString(text.getBytes(), 0, text.getLength());
            return line.split(BatchConstants.SEQUENCE_FILE_DEFAULT_DELIMITER, -1);
        }
    });
}
 
Example 18
Source Project: spark-streaming-direct-kafka   Source File: Functions.java    License: Apache License 2.0 5 votes vote down vote up
/** Returns a {@link Function} that yields its argument unchanged. */
public static <T> Function<T,T> identity() {
    // A lambda is behaviorally equivalent to the anonymous class it replaces.
    return t -> t;
}
 
Example 19
Source Project: incubator-nemo   Source File: SparkFrontendUtils.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Converts a {@link PairFunction} to a plain map {@link Function}.
 *
 * @param pairFunction the pair function to convert.
 * @param <T>          the type of original element.
 * @param <K>          the type of converted key.
 * @param <V>          the type of converted value.
 * @return the converted map function.
 */
public static <T, K, V> Function<T, Tuple2<K, V>> pairFunctionToPlainFunction(
  final PairFunction<T, K, V> pairFunction) {
  // Delegate straight to the wrapped pair function via a method reference.
  return pairFunction::call;
}
 
Example 20
Source Project: incubator-nemo   Source File: SparkJavaPairRDD.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Not yet supported by this frontend; always throws.
 *
 * @throws UnsupportedOperationException always
 */
@Override
public <C> SparkJavaPairRDD<K, C> combineByKey(final Function<V, C> createCombiner,
                                               final Function2<C, V, C> mergeValue,
                                               final Function2<C, C, C> mergeCombiners,
                                               final Partitioner partitioner,
                                               final boolean mapSideCombine,
                                               final Serializer serializer) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
 
Example 21
Source Project: incubator-nemo   Source File: SparkJavaPairRDD.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Not yet supported by this frontend; always throws.
 *
 * @throws UnsupportedOperationException always
 */
@Override
public <C> SparkJavaPairRDD<K, C> combineByKey(final Function<V, C> createCombiner,
                                               final Function2<C, V, C> mergeValue,
                                               final Function2<C, C, C> mergeCombiners,
                                               final Partitioner partitioner) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
 
Example 22
Source Project: incubator-nemo   Source File: SparkJavaPairRDD.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Not yet supported by this frontend; always throws.
 *
 * @throws UnsupportedOperationException always
 */
@Override
public <C> SparkJavaPairRDD<K, C> combineByKey(final Function<V, C> createCombiner,
                                               final Function2<C, V, C> mergeValue,
                                               final Function2<C, C, C> mergeCombiners,
                                               final int numPartitions) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
 
Example 23
Source Project: lambda-arch   Source File: BatchProcessor.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Returns a mapper from a Row to an IoTData instance.
 * NOTE(review): the mapping is purely positional — column order must match the
 * upstream query; verify against the producing SQL.
 */
private static Function<Row, IoTData> getRowIoTDataFunction() {
    return new Function<Row, IoTData>() {
        @Override
        public IoTData call(Row row) {
            return new IoTData(
                    row.getString(6),
                    row.getString(7),
                    row.getString(3),
                    row.getString(1),
                    row.getString(2),
                    row.getDate(5),
                    row.getDouble(4),
                    row.getDouble(0));
        }
    };
}
 
Example 24
Source Project: beam   Source File: SparkCombineFn.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Create an empty concrete accumulator for the given type.
 *
 * @param toValue          extracts the combined value from an input element
 * @param type             which accumulator strategy to instantiate
 * @param windowComparator orders windows; used by the merging strategy
 * @return the accumulator matching {@code type}
 * @throws IllegalArgumentException if {@code type} is not a known strategy
 */
static <InputT, ValueT, AccumT> WindowedAccumulator<InputT, ValueT, AccumT, ?> create(
    Function<InputT, ValueT> toValue, Type type, Comparator<BoundedWindow> windowComparator) {
  switch (type) {
    case MERGING:
      return MergingWindowedAccumulator.create(toValue, windowComparator);
    case NON_MERGING:
      return NonMergingWindowedAccumulator.create(toValue);
    case SINGLE_WINDOW:
    case EXPLODE_WINDOWS:
      // Deliberate fall-through: both single-window strategies share one accumulator.
      return SingleWindowWindowedAccumulator.create(toValue);
    default:
      throw new IllegalArgumentException("Unknown type: " + type);
  }
}
 
Example 25
Source Project: Java-Data-Science-Cookbook   Source File: ScalaTest.java    License: MIT License 5 votes vote down vote up
public static void main( String[] args ){
	String inputFile = "data/dummy.txt";
	SparkConf configuration = new SparkConf().setMaster("local[4]").setAppName("My App");
	JavaSparkContext sparkContext = new JavaSparkContext(configuration);
	JavaRDD<String> logData = sparkContext.textFile(inputFile).cache();

	// Count the lines that contain no characters at all.
	long emptyLineCount = logData.filter(new Function<String, Boolean>(){
		private static final long serialVersionUID = 1L;
		@Override
		public Boolean call(String line){
			return line.isEmpty();
		}
	}).count();
	sparkContext.close();
	System.out.println("Empty Lines: " + emptyLineCount);
}
 
Example 26
public static void main(String[] args) throws Exception {

	System.setProperty("hadoop.home.dir", "E:\\hadoop");

	SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]");
	JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));
	Logger rootLogger = LogManager.getRootLogger();
	rootLogger.setLevel(Level.WARN); 

	// Static base counts that every streaming batch is joined against.
	List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 10), new Tuple2<>("world", 10));
	JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);

	JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER);

	// Tokenize each line into words.
	JavaDStream<String> words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() );

	// Per-batch word counts.
	JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 );

	wordCounts.print();

	// Join each batch's counts with the static RDD and sum the two counts per word.
	JavaPairDStream<String, Integer> joinedDstream = wordCounts
			.transformToPair(new Function<JavaPairRDD<String, Integer>, JavaPairRDD<String, Integer>>() {
				@Override
				public JavaPairRDD<String, Integer> call(JavaPairRDD<String, Integer> rdd) throws Exception {
					return rdd.join(initialRDD).mapToPair(
							new PairFunction<Tuple2<String, Tuple2<Integer, Integer>>, String, Integer>() {
								@Override
								public Tuple2<String, Integer> call(
										Tuple2<String, Tuple2<Integer, Integer>> joinedTuple) throws Exception {
									return new Tuple2<>(joinedTuple._1(),(joinedTuple._2()._1() + joinedTuple._2()._2()));
								}
							});
				}
			});

	joinedDstream.print();
	streamingContext.start();
	streamingContext.awaitTermination();
}
 
Example 27
public static void main(String[] args) throws Exception {

	System.setProperty("hadoop.home.dir", "E:\\hadoop");

	SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]");
	JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));

	// Static base counts that every streaming batch is joined against.
	List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 10), new Tuple2<>("world", 10));
	JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);

	JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER);

	// Tokenize each line into words.
	JavaDStream<String> words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() );

	// Per-batch word counts.
	JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 );

	wordCounts.print();

	JavaPairDStream<String, Integer> joinedDstream = wordCounts.transformToPair(
			new Function<JavaPairRDD<String, Integer>, JavaPairRDD<String, Integer>>() {
				@Override
				public JavaPairRDD<String, Integer> call(JavaPairRDD<String, Integer> rdd) throws Exception {
					// BUG FIX: the joined-and-summed RDD was computed but discarded and the
					// untouched input rdd was returned, making the transform a no-op.
					// Return the join result so the base counts are actually added.
					return rdd.join(initialRDD).mapToPair(
							new PairFunction<Tuple2<String, Tuple2<Integer, Integer>>, String, Integer>() {
								@Override
								public Tuple2<String, Integer> call(Tuple2<String, Tuple2<Integer, Integer>> joinedTuple)
										throws Exception {
									return new Tuple2<>( joinedTuple._1(), (joinedTuple._2()._1()+joinedTuple._2()._2()) );
								}
							});
				}
			});

	joinedDstream.print();
	streamingContext.start();
	streamingContext.awaitTermination();
}
 
Example 28
Source Project: SparkDemo   Source File: JavaGaussianMixtureExample.java    License: MIT License 5 votes vote down vote up
public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaGaussianMixtureExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    // Load and parse data
    String path = "data/mllib/gmm_data.txt";
    JavaRDD<String> data = jsc.textFile(path);
    // Each line holds space-separated doubles; parse it into a dense vector.
    JavaRDD<Vector> parsedData = data.map(
      new Function<String, Vector>() {
        public Vector call(String s) {
          String[] sarray = s.trim().split(" ");
          double[] values = new double[sarray.length];
          for (int i = 0; i < sarray.length; i++) {
            values[i] = Double.parseDouble(sarray[i]);
          }
          return Vectors.dense(values);
        }
      }
    );
    parsedData.cache();

    // Cluster the data into two classes using GaussianMixture
    GaussianMixtureModel gmm = new GaussianMixture().setK(2).run(parsedData.rdd());

    // Save and load GaussianMixtureModel
    // BUG FIX: the load path used dots ("target/org.apache.spark...") while the save
    // path used slashes, so the example reloaded from a location it never wrote to.
    gmm.save(jsc.sc(), "target/org/apache/spark/JavaGaussianMixtureExample/GaussianMixtureModel");
    GaussianMixtureModel sameModel = GaussianMixtureModel.load(jsc.sc(),
      "target/org/apache/spark/JavaGaussianMixtureExample/GaussianMixtureModel");

    // Output the parameters of the mixture model
    for (int j = 0; j < gmm.k(); j++) {
      System.out.printf("weight=%f\nmu=%s\nsigma=\n%s\n",
        gmm.weights()[j], gmm.gaussians()[j].mu(), gmm.gaussians()[j].sigma());
    }
    // $example off$

    jsc.stop();
  }
 
Example 29
Source Project: SparkDemo   Source File: JavaLogisticRegressionWithLBFGSExample.java    License: MIT License 5 votes vote down vote up
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("JavaLogisticRegressionWithLBFGSExample");
  SparkContext sc = new SparkContext(conf);
  // $example on$
  String path = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();

  // 60/40 train/test split with a fixed seed for reproducibility.
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[] {0.6, 0.4}, 11L);
  JavaRDD<LabeledPoint> training = splits[0].cache();
  JavaRDD<LabeledPoint> test = splits[1];

  // Train a 10-class logistic regression model with L-BFGS.
  final LogisticRegressionModel model = new LogisticRegressionWithLBFGS()
    .setNumClasses(10)
    .run(training.rdd());

  // Pair each test point's prediction with its true label.
  JavaRDD<Tuple2<Object, Object>> predictionAndLabels = test.map(
    new Function<LabeledPoint, Tuple2<Object, Object>>() {
      public Tuple2<Object, Object> call(LabeledPoint point) {
        Double prediction = model.predict(point.features());
        return new Tuple2<Object, Object>(prediction, point.label());
      }
    }
  );

  // Evaluate the model on the held-out data.
  MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd());
  double accuracy = metrics.accuracy();
  System.out.println("Accuracy = " + accuracy);

  // Persist the trained model and read it back.
  model.save(sc, "target/tmp/javaLogisticRegressionWithLBFGSModel");
  LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc,
    "target/tmp/javaLogisticRegressionWithLBFGSModel");
  // $example off$

  sc.stop();
}
 
Example 30
Source Project: SparkDemo   Source File: JavaSimpleFPGrowth.java    License: MIT License 5 votes vote down vote up
public static void main(String[] args) {
  JavaSparkContext sc = SparkUtils.getLocalSparkContext(JavaSimpleFPGrowth.class);

  // $example on$
  JavaRDD<String> data = sc.textFile(Constant.LOCAL_FILE_PREX +"/data/mllib/sample_fpgrowth.txt");

  // Each input line is a transaction: space-separated item names.
  JavaRDD<List<String>> transactions = data.map(
    new Function<String, List<String>>() {
      public List<String> call(String line) {
        return Arrays.asList(line.split(" "));
      }
    }
  );

  // Mine frequent itemsets with minimum support 0.2 across 10 partitions.
  FPGrowth fpg = new FPGrowth()
    .setMinSupport(0.2)
    .setNumPartitions(10);
  FPGrowthModel<String> model = fpg.run(transactions);

  for (FPGrowth.FreqItemset<String> itemset: model.freqItemsets().toJavaRDD().collect()) {
    System.out.println("[" + itemset.javaItems() + "], " + itemset.freq());
  }

  // Derive association rules that meet a 0.8 confidence threshold.
  double minConfidence = 0.8;
  for (AssociationRules.Rule<String> rule
    : model.generateAssociationRules(minConfidence).toJavaRDD().collect()) {
    System.out.println(
      rule.javaAntecedent() + " => " + rule.javaConsequent() + ", " + rule.confidence());
  }
  // $example off$

  sc.stop();
}