Java Code Examples for org.apache.spark.api.java.JavaRDD#reduce()

The following examples show how to use org.apache.spark.api.java.JavaRDD#reduce(). Each example is taken from an open-source project; the source file, project, and license are listed above the code.
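As a quick orientation before the project examples, here is a minimal sketch of reduce() summing an RDD of integers with a Java 8 lambda (the local JavaSparkContext named sc is assumed for illustration):

JavaRDD<Integer> numbers = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
// reduce() repeatedly combines pairs of elements with the given associative,
// commutative function and returns the single combined value.
Integer sum = numbers.reduce((a, b) -> a + b); // 15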
Example 1
Source File: Reduce.java    From SparkDemo with MIT License
private static void reduce(JavaSparkContext sc) {
	
	List<Integer> numberList = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
	JavaRDD<Integer> javaRDD = sc.parallelize(numberList);
	
	/**
	 *   =====================================================
	 *   |                 Accumulate the sum                 |
	 *   =====================================================
	 */
	Integer num = javaRDD.reduce(new Function2<Integer, Integer, Integer>() {
		/**
		 * @param num1 the accumulated result returned by the previous call
		 * @param num2 the current element
		 */
		@Override
		public Integer call(Integer num1, Integer num2) throws Exception {
			// System.out.println(num1+"======"+num2);
			return num1 + num2;
		}
	});
	
	System.out.println(num);
	
	sc.close();
}
 
Example 2
Source File: Tokenizer.java    From vn.vitk with GNU General Public License v3.0
/**
 * Counts the number of non-space characters in this data set. This utility method 
 * is used to check the tokenization result.
 * @param lines an RDD of text lines
 * @return number of characters
 */
int numCharacters(JavaRDD<String> lines) {
	JavaRDD<Integer> lengths = lines.map(new Function<String, Integer>() {
		private static final long serialVersionUID = -2189399343462982586L;
		@Override
		public Integer call(String line) throws Exception {
			line = line.replaceAll("[\\s_]+", "");
			return line.length();
		}
	});
	return lengths.reduce(new Function2<Integer, Integer, Integer>() {
		private static final long serialVersionUID = -8438072946884289401L;

		@Override
		public Integer call(Integer e0, Integer e1) throws Exception {
			return e0 + e1;
		}
	});
}
 
Example 3
Source File: PiApproximation.java    From tutorials with MIT License
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("BaeldungPIApproximation").setMaster("local[2]");
    JavaSparkContext context = new JavaSparkContext(conf);
    int slices = args.length >= 1 ? Integer.valueOf(args[0]) : 2;
    int n = (100000L * slices) > Integer.MAX_VALUE ? Integer.MAX_VALUE : 100000 * slices;

    List<Integer> xs = IntStream.rangeClosed(0, n)
      .mapToObj(element -> Integer.valueOf(element))
      .collect(Collectors.toList());

    JavaRDD<Integer> dataSet = context.parallelize(xs, slices);

    JavaRDD<Integer> pointsInsideTheCircle = dataSet.map(integer -> {
       double x = Math.random() * 2 - 1;
       double y = Math.random() * 2 - 1;
       return (x * x + y * y) < 1 ? 1 : 0;
    });

    int count = pointsInsideTheCircle.reduce((integer, integer2) -> integer + integer2);

    System.out.println("The pi was estimated as:" + count / n);

    context.stop();
}
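The estimate works because the sampled points are uniform over the square [-1, 1] x [-1, 1] (area 4), while the unit circle inside it has area pi; the fraction count / n therefore approximates pi / 4, so 4.0 * count / n approximates pi, and floating-point division keeps the result from being truncated to an integer.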
 
Example 4
Source File: SparkDataValidation.java    From deeplearning4j with Apache License 2.0
protected static ValidationResult validateDataSets(JavaSparkContext sc, String path, boolean recursive, boolean deleteInvalid,
                                            int[] featuresShape, int[] labelsShape) {
    JavaRDD<String> paths;
    try {
        paths = SparkUtils.listPaths(sc, path, recursive);
    } catch (IOException e) {
        throw new RuntimeException("Error listing paths in directory", e);
    }

    JavaRDD<ValidationResult> results = paths.map(new ValidateDataSetFn(deleteInvalid, featuresShape, labelsShape));

    return results.reduce(new ValidationResultReduceFn());
}
 
Example 5
Source File: SparkDataValidation.java    From deeplearning4j with Apache License 2.0
protected static ValidationResult validateMultiDataSets(JavaSparkContext sc, String path, boolean recursive, boolean deleteInvalid,
                                                 int numFeatureArrays, int numLabelArrays,
                                                 List<int[]> featuresShape, List<int[]> labelsShape) {
    JavaRDD<String> paths;
    try {
        paths = SparkUtils.listPaths(sc, path, recursive);
    } catch (IOException e) {
        throw new RuntimeException("Error listing paths in directory", e);
    }

    JavaRDD<ValidationResult> results = paths.map(new ValidateMultiDataSetFn(deleteInvalid, numFeatureArrays, numLabelArrays,
            featuresShape, labelsShape));

    return results.reduce(new ValidationResultReduceFn());
}
 
Example 6
Source File: SparkDl4jMultiLayer.java    From deeplearning4j with Apache License 2.0
/**
 * Calculate the score for all examples in the provided {@code JavaRDD<DataSet>}, either by summing
 * or averaging over the entire data set. To calculate a score for each example individually, use {@link #scoreExamples(JavaPairRDD, boolean)}
 * or one of the similar methods
 *
 * @param data          Data to score
 * @param average       Whether to sum the scores, or average them
 * @param minibatchSize The number of examples to use in each minibatch when scoring. If more examples are in a partition than
 *                      this, multiple scoring operations will be done (to avoid using too much memory by doing the whole partition
 *                      in one go)
 */
public double calculateScore(JavaRDD<DataSet> data, boolean average, int minibatchSize) {
    JavaRDD<Tuple2<Integer, Double>> rdd = data.mapPartitions(
                    new ScoreFlatMapFunction(conf.toJson(), sc.broadcast(network.params(false)), minibatchSize));

    //Reduce to a single tuple, with example count + sum of scores
    Tuple2<Integer, Double> countAndSumScores = rdd.reduce(new IntDoubleReduceFunction());
    if (average) {
        return countAndSumScores._2() / countAndSumScores._1();
    } else {
        return countAndSumScores._2();
    }
}
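For context, a hedged usage sketch of this method (the variable names sparkNet and trainingData are illustrative assumptions, not part of the original source):

// Average the per-example scores over the whole RDD, scoring 32 examples per minibatch.
double averageScore = sparkNet.calculateScore(trainingData, true, 32);
// Pass false to return the summed score instead of the average.
double summedScore = sparkNet.calculateScore(trainingData, false, 32);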
 
Example 7
Source File: SparkComputationGraph.java    From deeplearning4j with Apache License 2.0
/**
 * Calculate the score for all examples in the provided {@code JavaRDD<DataSet>}, either by summing
 * or averaging over the entire data set. To calculate a score for each example individually, use {@link #scoreExamples(JavaPairRDD, boolean)}
 * or one of the similar methods
 *
 * @param data          Data to score
 * @param average       Whether to sum the scores, or average them
 * @param minibatchSize The number of examples to use in each minibatch when scoring. If more examples are in a partition than
 *                      this, multiple scoring operations will be done (to avoid using too much memory by doing the whole partition
 *                      in one go)
 */
public double calculateScore(JavaRDD<DataSet> data, boolean average, int minibatchSize) {
    JavaRDD<Tuple2<Long, Double>> rdd = data.mapPartitions(new ScoreFlatMapFunctionCGDataSet(conf.toJson(),
                    sc.broadcast(network.params()), minibatchSize));

    //Reduce to a single tuple, with example count + sum of scores
    Tuple2<Long, Double> countAndSumScores = rdd.reduce(new LongDoubleReduceFunction());
    if (average) {
        return countAndSumScores._2() / countAndSumScores._1();
    } else {
        return countAndSumScores._2();
    }
}
 
Example 8
Source File: SparkComputationGraph.java    From deeplearning4j with Apache License 2.0
/**
 * Calculate the score for all examples in the provided {@code JavaRDD<MultiDataSet>}, either by summing
 * or averaging over the entire data set.
 *
 * @param data          Data to score
 * @param average       Whether to sum the scores, or average them
 * @param minibatchSize The number of examples to use in each minibatch when scoring. If more examples are in a partition than
 *                      this, multiple scoring operations will be done (to avoid using too much memory by doing the whole partition
 *                      in one go)
 */
public double calculateScoreMultiDataSet(JavaRDD<MultiDataSet> data, boolean average, int minibatchSize) {
    JavaRDD<Tuple2<Long, Double>> rdd = data.mapPartitions(new ScoreFlatMapFunctionCGMultiDataSet(conf.toJson(),
                    sc.broadcast(network.params()), minibatchSize));
    //Reduce to a single tuple, with example count + sum of scores
    Tuple2<Long, Double> countAndSumScores = rdd.reduce(new LongDoubleReduceFunction());
    if (average) {
        return countAndSumScores._2() / countAndSumScores._1();
    } else {
        return countAndSumScores._2();
    }
}
 
Example 9
Source File: Tokenizer.java    From vn.vitk with GNU General Public License v3.0
/**
 * Tokenizes an RDD of text lines and returns an RDD of results.
 * @param input an RDD of text lines
 * @return an RDD of tokenized text lines.
 */
public JavaRDD<String> tokenize(JavaRDD<String> input) {
	if (verbose) {
		// print some basic statistics about the input, including 
		// max line length, min line length, average line length in syllables
		JavaRDD<Integer> wordCount = input.map(new Function<String, Integer>() {
			private static final long serialVersionUID = 7214093453452927565L;
			@Override
			public Integer call(String line) throws Exception {
				return line.split("\\s+").length;
			}
			
		});
		Comparator<Integer> comp = new IntegerComparator();
		System.out.println("Max line length (in syllables) = " + wordCount.max(comp));
		System.out.println("Min line length (in syllables) = " + wordCount.min(comp));
		float totalCount = wordCount.reduce(new Function2<Integer, Integer, Integer>() {
			private static final long serialVersionUID = 1L;
			@Override
			public Integer call(Integer v1, Integer v2) throws Exception {
				return v1 + v2;
			}
		});
		System.out.println("Avg line length (in syllables) = " + (totalCount) / input.count());
	}
	
	JavaRDD<String> output = null;
	if (classifier == null) {
		// use phrase graph approach (shortest paths and bigram model)
		// to segment phrases
		output = input.map(new SegmentationFunction());
	} else {
		// use logistic regression approach to segment phrases
		JavaRDD<String> s = input.map(new SegmentationFunction());
		// make sure that the preceding lazy computation has been evaluated
		// so that whitespace contexts have been properly accumulated
		System.out.println("Number of text lines = " + s.count());
		System.out.println("Number of contexts = " + contexts.value().size());
		// use the whitespace classification approach (logistic regression model)
		JavaRDD<WhitespaceContext> jrdd = jsc.parallelize(contexts.value());
		DataFrame df0 = (new SQLContext(jsc)).createDataFrame(jrdd, WhitespaceContext.class);
		DataFrame df1 = model.transform(df0);
		prediction = jsc.broadcast(df1.select("prediction").collect());
		if (df1.count() > 0) {
			output = s.map(new WhitespaceClassificationFunction());
		}
		else { 
			System.err.println("Empty data frame!");
		}
	}
	if (verbose) {
		// print number of non-space characters of the input and output dataset
		System.out.println("#(non-space characters of input) = " + numCharacters(input));
		if (output != null) {
			System.out.println("#(non-space characters of output) = " + numCharacters(output));
		}
	}
	return output;
}
 
Example 10
Source File: DPMeansClusterer.java    From ensemble-clustering with MIT License
private Map<String, Instance> initKMeans(SparkDataSet ds) {		
	JavaRDD<Map<String, Instance>> singletons = ds.getRDD().map( new InstanceToClusterFunction(clusterFactory) );
		
	Map<String, Instance> kmeans = singletons.reduce( new AggregateClusterFunction(distFunc, Double.MAX_VALUE) );
	
	return kmeans;
}
 
Example 11
Source File: ThresholdClusterer.java    From ensemble-clustering with MIT License
@Override
	public SparkClusterResult doCluster(DataSet ds) {
		// SparkDataSet needs to be passed in
		SparkDataSet rdd = (SparkDataSet)ds;
		
		// cache dataset in memory
//		rdd.getRDD().cache();
		
		distFunc = new DistanceFunction(this.typeDefs);
		ClusterFactory clusterFactory = new ClusterFactory(this.typeDefs, this.onlineUpdate);
		
		log.info("Starting threshold clusterer with threshold {}", threshold);
		
		// TODO look at using a reduce function 
		// Idea is the first step is a map<Instance, List<Instance>> that converts each instance to a single "cluster"
		// second step is a reduce where input is a List<Instances> and produces a List<Instances>
		// this step would merge clusters within threshold
		
		JavaPairRDD<String, Instance> instances = rdd.getRDD();
		instances.cache();
		
		// convert each instance into a singleton cluster
		JavaRDD<Map<String, Instance>> singletons = rdd.getRDD().map( new InstanceToClusterFunction(clusterFactory) );
		//singletons.cache();
		
		log.info("Generated initial singleton clusters");
		
		// merge clusters together
		Map<String, Instance> clusters = singletons.reduce( new AggregateClusterFunction(distFunc, threshold) );
		
		log.info("Merging clusters completed with {} clusters", clusters.size());
		
		// find the best cluster for each instance
		JavaPairRDD<String, Instance> bestCluster = instances.mapToPair( new BestClusterFunction(distFunc, clusters) );
		
		log.info("Output results");
		
		if (clusters != null && centroidsPath != null) rdd.getContext().parallelize(new ArrayList<Instance>(clusters.values())).saveAsTextFile(centroidsPath);
	
		if (bestCluster != null && clustersPath != null) bestCluster.saveAsTextFile(clustersPath);
		
		log.info("Threshold clusterer completed");
		
		// return the cluster membership rdd
		return new SparkClusterResult(bestCluster);
	}