org.apache.spark.api.java.function.Function Java Examples

The following examples show how to use org.apache.spark.api.java.function.Function. They are extracted from open-source projects; the source file, project, and license are noted above each example.
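Function<T, R> is a serializable functional interface with a single method, R call(T t) throws Exception, and it is what Java-API transformations such as map() and filter() accept. Before the project examples, here is a minimal, self-contained sketch of the anonymous-class form and the equivalent Java 8 lambda; the class name and sample data are illustrative and not taken from any of the listed projects.

import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

public class FunctionIntroSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext("local[2]", "FunctionIntroSketch");
        JavaRDD<String> words = sc.parallelize(Arrays.asList("alpha", "beta", "gamma"));

        // Anonymous-class form: implement R call(T t) and pass it to map().
        JavaRDD<Integer> lengths = words.map(new Function<String, Integer>() {
            @Override
            public Integer call(String word) throws Exception {
                return word.length();
            }
        });

        // Java 8 lambda form of the same transformation.
        JavaRDD<Integer> lengthsLambda = words.map(word -> word.length());

        System.out.println(lengths.collect());        // [5, 4, 5]
        System.out.println(lengthsLambda.collect());  // [5, 4, 5]
        sc.close();
    }
}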
Example #1
Source File: SparkCombineFn.java    From beam with Apache License 2.0
SparkCombineFn(
    boolean global,
    Function<InputT, ValueT> toValue,
    CombineWithContext.CombineFnWithContext<ValueT, AccumT, OutputT> combineFn,
    SerializablePipelineOptions options,
    Map<TupleTag<?>, KV<WindowingStrategy<?, ?>, SideInputBroadcast<?>>> sideInputs,
    WindowingStrategy<?, ?> windowingStrategy) {
  this(
      global,
      toValue,
      combineFn,
      options,
      sideInputs,
      windowingStrategy,
      WindowedAccumulator.Type.EXPLODE_WINDOWS);
}
 
Example #2
Source File: SparkStreamDemo.java    From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    // Run locally with two cores and a batch interval of 1 second
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("sparkStreamIng");
    JavaStreamingContext javaStreamingContext = new JavaStreamingContext(conf, Durations.seconds(1));
    // Create a DStream that connects to localhost:8080
    JavaReceiverInputDStream<String> dStream = javaStreamingContext.socketTextStream("localhost", 8080);
    JavaDStream<String> errorLine = dStream.filter(new Function<String, Boolean>() {
        @Override
        public Boolean call(String v1) throws Exception {
            return v1.contains("error");
        }
    });
    // Print the lines that contain "error"
    errorLine.print();
    try {
        // Start the computation
        javaStreamingContext.start();
        // Wait for the computation to terminate
        javaStreamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
 
Example #3
Source File: FunctionCompiler.java    From rheem with Apache License 2.0
/**
 * Create an appropriate {@link Function}-based predicate for deploying the given {@link PredicateDescriptor}
 * on Apache Spark.
 *
 * @param predicateDescriptor describes the function
 * @param operator            that executes the {@link Function}; only required if the {@code descriptor} describes an {@link ExtendedFunction}
 * @param operatorContext     contains optimization information for the {@code operator}
 * @param inputs              that feed the {@code operator}; only required if the {@code descriptor} describes an {@link ExtendedFunction}
 */
public <Type> Function<Type, Boolean> compile(
        PredicateDescriptor<Type> predicateDescriptor,
        SparkExecutionOperator operator,
        OptimizationContext.OperatorContext operatorContext,
        ChannelInstance[] inputs) {
    final Predicate<Type> javaImplementation = predicateDescriptor.getJavaImplementation();
    if (javaImplementation instanceof PredicateDescriptor.ExtendedSerializablePredicate) {
        return new ExtendedPredicateAdapater<>(
                (PredicateDescriptor.ExtendedSerializablePredicate<Type>) javaImplementation,
                new SparkExecutionContext(operator, inputs, operatorContext.getOptimizationContext().getIterationNumber())
        );
    } else {
        return new PredicateAdapter<>(javaImplementation);
    }
}
 
Example #4
Source File: LogError.java    From sparkResearch with Apache License 2.0
/**
 * Applies a transformation and an action to the log RDD.
 */
public void log(JavaSparkContext sparkContext) {
    JavaRDD<String> inputRDD = sparkContext.textFile("/usr/local/log");
    JavaRDD<String> errorRDD = inputRDD.filter(new Function<String, Boolean>() {
        @Override
        public Boolean call(String v1) throws Exception {
            // keep only the lines that contain "error"
            return v1.contains("error");
        }
    });

    long errorRDDCount = errorRDD.count();
    System.out.println("errorRDD count is " + errorRDDCount);
    for (String rddLine : errorRDD.take(10)) {
        System.out.println("errorRDD 数据is " + rddLine);
    }
}
 
Example #5
Source File: Tagger.java    From vn.vitk with GNU General Public License v3.0
private JavaRDD<String> toTaggedSentence(DataFrame output) {
	return output.javaRDD().map(new Function<Row, String>() {
		private static final long serialVersionUID = 4208643510231783579L;
		@Override
		public String call(Row row) throws Exception {
			String[] tokens = row.getString(0).trim().split("\\s+");
			String[] tags = row.getString(1).trim().split("\\s+");
			if (tokens.length != tags.length) {
				System.err.println("Incompatible lengths!");
				return null;
			}
			StringBuilder sb = new StringBuilder(64);
			for (int j = 0; j < tokens.length; j++) {
				sb.append(tokens[j]);
				sb.append('/');
				sb.append(tags[j]);
				sb.append(' ');
			}
			return sb.toString().trim();
		}
	});
}
 
Example #6
Source File: SparkSqlApplication.java    From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder().master("local")
            .appName("Java Spark SQL")
            .getOrCreate();

    Dataset<Row> dataset = sparkSession.read().json("D:\\a.txt");
    // Select only the "name" column
    dataset.select("name").show();
    // Select two columns, adding 1 to every "age" value
    dataset.select(col("name"),col("age").plus(1)).show();
    // Select people whose age is greater than 21
    dataset.filter(col("age").gt(21)).show();
    // Group by age and count
    dataset.groupBy("age").count().show();
    // Show the full dataset
    dataset.show();


    /* Run SQL queries programmatically */
    // Register a temporary view
    dataset.createOrReplaceTempView("user");
    Dataset<Row> users = sparkSession.sql("SELECT * FROM user");

    JavaRDD<Object> toText = users.toJavaRDD().map((Function<Row, Object>) v1 -> v1.getString(0));
    users.show();
}
 
Example #7
Source File: BroadCastParam.java    From sparkResearch with Apache License 2.0
/**
 * Broadcast variable demo.
 * @param args
 */
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local[4]").appName("AttackFind").getOrCreate();
    // Initialize the JavaSparkContext
    JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(sparkSession.sparkContext());
    // Data to broadcast
    // (recall that broadcast variables are read-only)
    final List<String> broadcastList = Arrays.asList("190099HJLL","98392QUEYY","561788LLKK");
    // Create the broadcast variable
    final Broadcast<List<String>> broadcast = javaSparkContext.broadcast(broadcastList);
    // Define the data to filter
    JavaPairRDD<String,String> pairRDD = javaSparkContext.parallelizePairs(Arrays.asList(new Tuple2<>("000", "000")));
    JavaPairRDD<String,String> resultPairRDD = pairRDD.filter((Function<Tuple2<String, String>, Boolean>) v1 -> broadcast.value().contains(v1._2));
    resultPairRDD.foreach((VoidFunction<Tuple2<String, String>>) System.out::println);
}
 
Example #8
Source File: DependencyParser.java    From vn.vitk with GNU General Public License v3.0
/**
 * Parses all sentences in an input file, one sentence per line, and writes the
 * flattened dependency tuples to the console.
 * @param jsc
 * @param inputFileName
 */
public void parse(JavaSparkContext jsc, String inputFileName) {
	List<String> sentences = jsc.textFile(inputFileName).collect();
	JavaRDD<String> input = jsc.parallelize(sentences);
	JavaRDD<Sentence> sents = input.map(new TaggedLineToSentenceFunction());
	JavaRDD<DependencyGraph> graphs = sents.map(new ParsingFunction());
	JavaRDD<String> rows = graphs.map(new Function<DependencyGraph, String>() {
		private static final long serialVersionUID = -6021310762521034121L;

		public String call(DependencyGraph graph) {
			return graph.dependencies();
		}
	});
	for (String s : rows.collect()) {
		System.out.println(s);
	}
}
 
Example #9
Source File: NGramBuilder.java    From vn.vitk with GNU General Public License v3.0
/**
 * Creates an n-gram data frame from text lines.
 * @param lines
 * @return an n-gram data frame.
 */
DataFrame createNGramDataFrame(JavaRDD<String> lines) {
	JavaRDD<Row> rows = lines.map(new Function<String, Row>(){
		private static final long serialVersionUID = -4332903997027358601L;
		
		@Override
		public Row call(String line) throws Exception {
			return RowFactory.create(Arrays.asList(line.split("\\s+")));
		}
	});
	StructType schema = new StructType(new StructField[] {
			new StructField("words",
					DataTypes.createArrayType(DataTypes.StringType), false,
					Metadata.empty()) });
	DataFrame wordDF = new SQLContext(jsc).createDataFrame(rows, schema);
	// build a bigram language model
	NGram transformer = new NGram().setInputCol("words")
			.setOutputCol("ngrams").setN(2);
	DataFrame ngramDF = transformer.transform(wordDF);
	ngramDF.show(10, false);
	return ngramDF;
}
 
Example #10
Source File: SparkUtil.java    From kylin-on-parquet-v2 with Apache License 2.0
private static JavaRDD<String[]> getOtherFormatHiveInput(JavaSparkContext sc, String hiveTable) {
    SparkSession sparkSession = SparkSession.builder().sparkContext(HiveUtils.withHiveExternalCatalog(sc.sc()))
            .config(sc.getConf()).enableHiveSupport().getOrCreate();
    final Dataset intermediateTable = sparkSession.table(hiveTable);
    return intermediateTable.javaRDD().map(new Function<Row, String[]>() {
        @Override
        public String[] call(Row row) throws Exception {
            String[] result = new String[row.size()];
            for (int i = 0; i < row.size(); i++) {
                final Object o = row.get(i);
                if (o != null) {
                    result[i] = o.toString();
                } else {
                    result[i] = null;
                }
            }
            return result;
        }
    });
}
 
Example #11
Source File: IteratorUtilsTest.java    From kylin-on-parquet-v2 with Apache License 2.0
private static ArrayList<Tuple2<Integer, Integer>> getResult(List<Tuple2<Integer, Integer>> list) {
    return Lists.newArrayList(IteratorUtils.merge(list.iterator(), new Comparator<Integer>() {
        @Override
        public int compare(Integer o1, Integer o2) {
            return o1 - o2;
        }
    }, new Function<Iterable<Integer>, Integer>() {
        @Override
        public Integer call(Iterable<Integer> v1) throws Exception {
            int sum = 0;
            for (Integer integer : v1) {
                sum += integer;
            }
            return sum;
        }
    }));
}
 
Example #12
Source File: SparkCombineFn.java    From beam with Apache License 2.0
/** Create concrete accumulator for given type. */
static <InputT, ValueT, AccumT> WindowedAccumulator<InputT, ValueT, AccumT, ?> create(
    Function<InputT, ValueT> toValue,
    Type type,
    Iterable<WindowedValue<AccumT>> values,
    Comparator<BoundedWindow> windowComparator) {
  switch (type) {
    case MERGING:
      return MergingWindowedAccumulator.from(toValue, values, windowComparator);
    case NON_MERGING:
      return NonMergingWindowedAccumulator.from(toValue, values);
    case SINGLE_WINDOW:
    case EXPLODE_WINDOWS:
      Iterator<WindowedValue<AccumT>> iter = values.iterator();
      if (iter.hasNext()) {
        return SingleWindowWindowedAccumulator.create(toValue, iter.next());
      }
      return SingleWindowWindowedAccumulator.create(toValue);
    default:
      throw new IllegalArgumentException("Unknown type: " + type);
  }
}
 
Example #13
Source File: Filter.java    From SparkDemo with MIT License
private static void filter(JavaSparkContext sc) {
	List<Integer> datas = Arrays.asList(1, 2, 3, 7, 4, 5, 8);

	JavaRDD<Integer> rddData = sc.parallelize(datas);
	JavaRDD<Integer> filterRDD = rddData.filter(
			// jdk1.8
			// v1 -> v1 >= 4
			new Function<Integer, Boolean>() {
				public Boolean call(Integer v) throws Exception {
					// filter out numbers smaller than 4
					return v >= 4;
				}
			});

	filterRDD.foreach(
			// jdk1.8
			// v -> System.out.println(v)
			new VoidFunction<Integer>() {
				@Override
				public void call(Integer integer) throws Exception {
					System.out.println(integer);
				}
			});
	sc.close();
}
 
Example #14
Source File: SparkCombineFn.java    From beam with Apache License 2.0
SingleWindowWindowedAccumulator(
    Function<InputT, ValueT> toValue, WindowedValue<AccumT> accumulator) {
  this.toValue = toValue;
  this.windowAccumulator = accumulator.getValue();
  this.accTimestamp =
      accumulator.getTimestamp().equals(BoundedWindow.TIMESTAMP_MIN_VALUE)
          ? null
          : accumulator.getTimestamp();
  this.accWindow = getWindow(accumulator);
}
 
Example #15
Source File: CustomDataFrame.java    From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local")
            .appName("spark app")
            .getOrCreate();

    // Create an ordinary JavaRDD
    JavaRDD<String> javaRDD = sparkSession.sparkContext().textFile("URL", 1).toJavaRDD();
    // The schema, encoded as a string
    String schema = "name age";

    // Generate the schema from the schema string
    List<StructField> structFieldList = new ArrayList<>();
    for (String fieldName : schema.split(" ")) {
        StructField structField = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
        structFieldList.add(structField);
    }
    StructType structType = DataTypes.createStructType(structFieldList);

    JavaRDD<Row> rowJavaRDD = javaRDD.map(new Function<String, Row>() {
        @Override
        public Row call(String v1) {
            String[] attributes = v1.split(",");
            return RowFactory.create(attributes[0], attributes[1].trim());
        }
    });

    // Apply the schema to the RDD
    Dataset<Row> dataset = sparkSession.createDataFrame(rowJavaRDD, structType);

    // Create a temporary view
    dataset.createOrReplaceTempView("user");
    Dataset<Row> result = sparkSession.sql("select * from user");
    result.show();
}
 
Example #16
Source File: BatchProcessor.java    From lambda-arch with Apache License 2.0
private static Function<Row, IoTData> getRowIoTDataFunction() {
    return row -> new IoTData(
                row.getString(6),
                row.getString(7),
                row.getString(3),
                row.getString(1),
                row.getString(2),
                row.getDate(5),
                row.getDouble(4),
                row.getDouble(0)
        );
}
 
Example #17
Source File: SparkUtil.java    From kylin-on-parquet-v2 with Apache License 2.0
private static JavaRDD<String[]> getSequenceFormatHiveInput(JavaSparkContext sc, String inputPath) {
    return sc.sequenceFile(inputPath, BytesWritable.class, Text.class).values()
            .map(new Function<Text, String[]>() {
                @Override
                public String[] call(Text text) throws Exception {
                    String s = Bytes.toString(text.getBytes(), 0, text.getLength());
                    return s.split(BatchConstants.SEQUENCE_FILE_DEFAULT_DELIMITER, -1);
                }
            });
}
 
Example #18
Source File: SparkFrontendUtils.java    From incubator-nemo with Apache License 2.0
/**
 * Converts a {@link PairFunction} to a plain map {@link Function}.
 *
 * @param pairFunction the pair function to convert.
 * @param <T>          the type of original element.
 * @param <K>          the type of converted key.
 * @param <V>          the type of converted value.
 * @return the converted map function.
 */
public static <T, K, V> Function<T, Tuple2<K, V>> pairFunctionToPlainFunction(
  final PairFunction<T, K, V> pairFunction) {
  return new Function<T, Tuple2<K, V>>() {
    @Override
    public Tuple2<K, V> call(final T elem) throws Exception {
      return pairFunction.call(elem);
    }
  };
}
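A hypothetical call site for this helper is sketched below; the names and the sample input are illustrative, not taken from the Nemo sources, and the sketch assumes SparkFrontendUtils and the Spark function interfaces are on the classpath.

import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

public class PairFunctionUsageSketch {
    public static void main(String[] args) throws Exception {
        // A PairFunction that maps a word to a (word, length) pair.
        PairFunction<String, String, Integer> pf = word -> new Tuple2<>(word, word.length());

        // Convert it to a plain Function, e.g. for use with JavaRDD#map(...).
        Function<String, Tuple2<String, Integer>> plain =
                SparkFrontendUtils.pairFunctionToPlainFunction(pf);

        System.out.println(plain.call("nemo"));  // prints (nemo,4)
    }
}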
 
Example #19
Source File: Functions.java    From spark-streaming-direct-kafka with Apache License 2.0
public static <T> Function<T,T> identity() {
    return new Function<T,T>() {
        @Override
        public T call(T t) {
            return t;
        }
    };
}
 
Example #20
Source File: FunctionCompiler.java    From rheem with Apache License 2.0
/**
 * Create an appropriate {@link Function} for deploying the given {@link MapPartitionsDescriptor}
 * on Apache Spark's {@link JavaRDD#mapPartitions(FlatMapFunction)}.
 *
 * @param descriptor      describes the function
 * @param operator        that executes the {@link Function}; only required if the {@code descriptor} describes an {@link ExtendedFunction}
 * @param operatorContext contains optimization information for the {@code operator}
 * @param inputs          that feed the {@code operator}; only required if the {@code descriptor} describes an {@link ExtendedFunction}
 */
public <I, O> FlatMapFunction<Iterator<I>, O> compile(MapPartitionsDescriptor<I, O> descriptor,
                                                      SparkExecutionOperator operator,
                                                      OptimizationContext.OperatorContext operatorContext,
                                                      ChannelInstance[] inputs) {
    final java.util.function.Function<Iterable<I>, Iterable<O>> javaImplementation = descriptor.getJavaImplementation();
    if (javaImplementation instanceof FunctionDescriptor.ExtendedSerializableFunction) {
        return new ExtendedMapPartitionsFunctionAdapter<>(
                (FunctionDescriptor.ExtendedSerializableFunction<Iterable<I>, Iterable<O>>) javaImplementation,
                new SparkExecutionContext(operator, inputs, operatorContext.getOptimizationContext().getIterationNumber())
        );
    } else {
        return new MapPartitionsFunctionAdapter<>(javaImplementation);
    }
}
 
Example #21
Source File: CaseWhenTest.java    From BigDataPlatform with GNU General Public License v3.0
public static void main(String[] args) {
	SparkConf conf = new SparkConf()
			.setMaster("local") 
			.setAppName("CaseWhenTest");
	JavaSparkContext sc = new JavaSparkContext(conf);
	SQLContext sqlContext = new SQLContext(sc.sc());
	
	List<Integer> grades = Arrays.asList(85, 90, 60, 73);
	JavaRDD<Integer> gradesRDD = sc.parallelize(grades);
	JavaRDD<Row> gradeRowsRDD = gradesRDD.map(new Function<Integer, Row>() {

		private static final long serialVersionUID = 1L;

		@Override
		public Row call(Integer grade) throws Exception {
			return RowFactory.create(grade);
		}
		
	});
	
	StructType schema = DataTypes.createStructType(Arrays.asList(
			DataTypes.createStructField("grade", DataTypes.IntegerType, true)));
	Dataset<Row> gradesDF = sqlContext.createDataFrame(gradeRowsRDD, schema);
	gradesDF.registerTempTable("grades");

	Dataset<Row>  gradeLevelDF = sqlContext.sql(
			"SELECT CASE "
				+ "WHEN grade>=90 THEN 'A' "
				+ "WHEN grade>=80 THEN 'B' "
				+ "WHEN grade>=70 THEN 'C' "
				+ "WHEN grade>=60 THEN 'D' "
				+ "ELSE 'E' "
				+ "END gradeLevel "
			+ "FROM grades");
	
	gradeLevelDF.show();
	
	sc.close(); 
}
 
Example #22
Source File: SparkCombineFn.java    From beam with Apache License 2.0
static <InputT, ValueT, AccumT> WindowedAccumulator<InputT, ValueT, AccumT, ?> create(
    Function<InputT, ValueT> toValue, Type type, Comparator<BoundedWindow> windowComparator) {
  switch (type) {
    case MERGING:
      return MergingWindowedAccumulator.create(toValue, windowComparator);
    case NON_MERGING:
      return NonMergingWindowedAccumulator.create(toValue);
    case SINGLE_WINDOW:
    case EXPLODE_WINDOWS:
      return SingleWindowWindowedAccumulator.create(toValue);
    default:
      throw new IllegalArgumentException("Unknown type: " + type);
  }
}
 
Example #23
Source File: ScalaTest.java    From Java-Data-Science-Cookbook with MIT License
public static void main( String[] args ){
	String inputFile = "data/dummy.txt";
	SparkConf configuration = new SparkConf().setMaster("local[4]").setAppName("My App");
	JavaSparkContext sparkContext = new JavaSparkContext(configuration);
	JavaRDD<String> logData = sparkContext.textFile(inputFile).cache();

	long numberA = logData.filter(new Function<String,Boolean>(){
		private static final long serialVersionUID = 1L;
		public Boolean call(String s){
			return s.length() == 0;
		}
	}).count();
	sparkContext.close();
	System.out.println("Empty Lines: " + numberA);
}
 
Example #24
Source File: WordCountTransformOpEx.java    From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) throws Exception {
  
      System.setProperty("hadoop.home.dir", "E:\\hadoop");
	
   SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]");
   JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));
   Logger rootLogger = LogManager.getRootLogger();
   rootLogger.setLevel(Level.WARN);
   List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 10), new Tuple2<>("world", 10));
   JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);
	    

   JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER);
   
   JavaDStream<String> words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() );
  
   JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 );
  
   wordCounts.print();
   
JavaPairDStream<String, Integer> joinedDstream = wordCounts
		.transformToPair(new Function<JavaPairRDD<String, Integer>, JavaPairRDD<String, Integer>>() {
			@Override
			public JavaPairRDD<String, Integer> call(JavaPairRDD<String, Integer> rdd) throws Exception {
				JavaPairRDD<String, Integer> modRDD = rdd.join(initialRDD).mapToPair(
						new PairFunction<Tuple2<String, Tuple2<Integer, Integer>>, String, Integer>() {
							@Override
							public Tuple2<String, Integer> call(
									Tuple2<String, Tuple2<Integer, Integer>> joinedTuple) throws Exception {
								return new Tuple2<>(joinedTuple._1(),(joinedTuple._2()._1() + joinedTuple._2()._2()));
							}
						});
				return modRDD;
			}
		});

   joinedDstream.print();
   streamingContext.start();
   streamingContext.awaitTermination();
 }
 
Example #25
Source File: WordCountSocketJava8Ex.java    From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) throws Exception {
 
     System.setProperty("hadoop.home.dir", "E:\\hadoop");
	
  SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]");
  JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));
  
  List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 10), new Tuple2<>("world", 10));
  JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);
    

  JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER);
  
  JavaDStream<String> words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() );
 
  JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 );
 
  wordCounts.print();
  
JavaPairDStream<String, Integer> joinedDstream = wordCounts.transformToPair(
   new Function<JavaPairRDD<String, Integer>, JavaPairRDD<String, Integer>>() {
	    @Override public JavaPairRDD<String, Integer> call(JavaPairRDD<String, Integer> rdd) throws Exception {
	    	JavaPairRDD<String, Integer> joinedRDD = rdd.join(initialRDD).mapToPair(new PairFunction<Tuple2<String, Tuple2<Integer, Integer>>, String, Integer>() {
				@Override
				public Tuple2<String, Integer> call(Tuple2<String, Tuple2<Integer, Integer>> joinedTuple)
						throws Exception {
					// sum the streamed count and the initial count for each word
					return new Tuple2<>(joinedTuple._1(), (joinedTuple._2()._1() + joinedTuple._2()._2()));
				}
			});

		return joinedRDD;
	    }
	  });
 
joinedDstream.print();
  streamingContext.start();
  streamingContext.awaitTermination();
}
 
Example #26
Source File: MapTest.java    From SparkDemo with MIT License
public static void main(String[] args) {
	JavaSparkContext sc = SparkUtils.getLocalSparkContext(MapTest.class);

	List<String> list = Arrays.asList("hello,bjsxt", "hello,xuruyun");

	JavaRDD<String> linesRDD = sc.parallelize(list);

	JavaRDD<Object> mapRDD = linesRDD.map(new Function<String, Object>() {

		@Override
		public Object call(String v1) throws Exception {
			return v1.split(",");
		}
	});

	JavaRDD<String> flatMapRDD = linesRDD.flatMap(new FlatMapFunction<String, String>() {

		@Override
		public Iterator<String> call(String t) throws Exception {
			// split each line on "," and return the pieces as an iterator
			return Arrays.asList(t.split(",")).iterator();
		}
	});

	List<Object> collect = mapRDD.collect(); // action operator: triggers execution
	for (Object obj : collect) {
		System.out.println(obj);
	}

	List<String> collect2 = flatMapRDD.collect(); // action operator: triggers execution
	for (String s : collect2) {
		System.out.println(s);
	}
}
 
Example #27
Source File: JavaGaussianMixtureExample.java    From SparkDemo with MIT License
public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaGaussianMixtureExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    // Load and parse data
    String path = "data/mllib/gmm_data.txt";
    JavaRDD<String> data = jsc.textFile(path);
    JavaRDD<Vector> parsedData = data.map(
      new Function<String, Vector>() {
        public Vector call(String s) {
          String[] sarray = s.trim().split(" ");
          double[] values = new double[sarray.length];
          for (int i = 0; i < sarray.length; i++) {
            values[i] = Double.parseDouble(sarray[i]);
          }
          return Vectors.dense(values);
        }
      }
    );
    parsedData.cache();

    // Cluster the data into two classes using GaussianMixture
    GaussianMixtureModel gmm = new GaussianMixture().setK(2).run(parsedData.rdd());

    // Save and load GaussianMixtureModel
    gmm.save(jsc.sc(), "target/org/apache/spark/JavaGaussianMixtureExample/GaussianMixtureModel");
    GaussianMixtureModel sameModel = GaussianMixtureModel.load(jsc.sc(),
      "target/org/apache/spark/JavaGaussianMixtureExample/GaussianMixtureModel");

    // Output the parameters of the mixture model
    for (int j = 0; j < gmm.k(); j++) {
      System.out.printf("weight=%f\nmu=%s\nsigma=\n%s\n",
        gmm.weights()[j], gmm.gaussians()[j].mu(), gmm.gaussians()[j].sigma());
    }
    // $example off$

    jsc.stop();
  }
 
Example #28
Source File: JavaFlumeEventCount.java    From SparkDemo with MIT License
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println("Usage: JavaFlumeEventCount <host> <port>");
    System.exit(1);
  }

  StreamingExamples.setStreamingLogLevels();

  String host = args[0];
  int port = Integer.parseInt(args[1]);

  Duration batchInterval = new Duration(2000);
  SparkConf sparkConf = new SparkConf().setAppName("JavaFlumeEventCount");
  JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, batchInterval);
  JavaReceiverInputDStream<SparkFlumeEvent> flumeStream =
    FlumeUtils.createStream(ssc, host, port);

  flumeStream.count();

  flumeStream.count().map(new Function<Long, String>() {
    @Override
    public String call(Long in) {
      return "Received " + in + " flume events.";
    }
  }).print();

  ssc.start();
  ssc.awaitTermination();
}
 
Example #29
Source File: JavaLogisticRegressionWithLBFGSExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("JavaLogisticRegressionWithLBFGSExample");
  SparkContext sc = new SparkContext(conf);
  // $example on$
  String path = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();

  // Split initial RDD into two... [60% training data, 40% testing data].
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[] {0.6, 0.4}, 11L);
  JavaRDD<LabeledPoint> training = splits[0].cache();
  JavaRDD<LabeledPoint> test = splits[1];

  // Run training algorithm to build the model.
  final LogisticRegressionModel model = new LogisticRegressionWithLBFGS()
    .setNumClasses(10)
    .run(training.rdd());

  // Compute raw scores on the test set.
  JavaRDD<Tuple2<Object, Object>> predictionAndLabels = test.map(
    new Function<LabeledPoint, Tuple2<Object, Object>>() {
      public Tuple2<Object, Object> call(LabeledPoint p) {
        Double prediction = model.predict(p.features());
        return new Tuple2<Object, Object>(prediction, p.label());
      }
    }
  );

  // Get evaluation metrics.
  MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd());
  double accuracy = metrics.accuracy();
  System.out.println("Accuracy = " + accuracy);

  // Save and load model
  model.save(sc, "target/tmp/javaLogisticRegressionWithLBFGSModel");
  LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc,
    "target/tmp/javaLogisticRegressionWithLBFGSModel");
  // $example off$

  sc.stop();
}
 
Example #30
Source File: JavaSimpleFPGrowth.java    From SparkDemo with MIT License
public static void main(String[] args) {
  JavaSparkContext sc = SparkUtils.getLocalSparkContext(JavaSimpleFPGrowth.class);

  // $example on$
  JavaRDD<String> data = sc.textFile(Constant.LOCAL_FILE_PREX +"/data/mllib/sample_fpgrowth.txt");

  JavaRDD<List<String>> transactions = data.map(
    new Function<String, List<String>>() {
      public List<String> call(String line) {
        String[] parts = line.split(" ");
        return Arrays.asList(parts);
      }
    }
  );

  FPGrowth fpg = new FPGrowth()
    .setMinSupport(0.2)
    .setNumPartitions(10);
  FPGrowthModel<String> model = fpg.run(transactions);

  for (FPGrowth.FreqItemset<String> itemset: model.freqItemsets().toJavaRDD().collect()) {
    System.out.println("[" + itemset.javaItems() + "], " + itemset.freq());
  }

  double minConfidence = 0.8;
  for (AssociationRules.Rule<String> rule
    : model.generateAssociationRules(minConfidence).toJavaRDD().collect()) {
    System.out.println(
      rule.javaAntecedent() + " => " + rule.javaConsequent() + ", " + rule.confidence());
  }
  // $example off$

  sc.stop();
}