org.apache.spark.api.java.function.Function2 Java Examples

The following examples show how to use org.apache.spark.api.java.function.Function2. Each example is taken from an open-source project; the originating project, source file, and license are noted above the code.
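Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) showing the two interchangeable ways to pass a Function2 to an RDD reduction: an anonymous class that implements call(), and the equivalent Java 8 lambda. The class name Function2Sketch and the sample data are invented for illustration.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

public class Function2Sketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("Function2Sketch").setMaster("local[2]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<Integer> numbers = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));

        // Function2<T1, T2, R> declares a single method: R call(T1 v1, T2 v2) throws Exception.
        Integer sum = numbers.reduce(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) {
                return v1 + v2;
            }
        });

        // Because Function2 has a single abstract method, a lambda works the same way on Java 8+.
        Integer sumAgain = numbers.reduce((v1, v2) -> v1 + v2);

        System.out.println(sum + ", " + sumAgain); // 15, 15
        sc.close();
    }
}

Every example that follows is a variation on this pattern: Function2's first two type parameters are the argument types of call(), and the third is its return type.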
Example #1
Source File: KafkaStreaming.java    From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("KafkaWordCount").setMaster("local[2]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10000));
    // Set the checkpoint directory
    streamingContext.checkpoint("HDFS URL");
    Map<String, Integer> topicThread = new HashMap<>(1);
    topicThread.put(TOPIC, THREAD);
    JavaPairInputDStream<String, String> dStream = KafkaUtils.createStream(streamingContext, HOST, GROP, topicThread);

    JavaDStream<String> words = dStream.flatMap((FlatMapFunction<Tuple2<String, String>, String>) stringStringTuple2 -> Arrays.asList(SPACE.split(stringStringTuple2._2)).iterator());

    // Count the words
    JavaPairDStream<String, Integer> result = words.mapToPair((PairFunction<String, String, Integer>) s -> new Tuple2<>(s, 1)).reduceByKey((Function2<Integer, Integer, Integer>) (v1, v2) -> v1 + v2);

    try {
        result.print();
        streamingContext.start();
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
 
Example #2
Source File: Reduce.java    From SparkDemo with MIT License
private static void reduce(JavaSparkContext sc) {
	
	List<Integer> numberList=Arrays.asList(1,2,3,4,5,6,7,8,9,10);
	JavaRDD<Integer> javaRDD = sc.parallelize(numberList);
	
	// Accumulate: sum all the elements of the RDD.
	Integer num = javaRDD.reduce(new Function2<Integer, Integer, Integer>() {
		/**
		 * @param num1 the accumulated result returned by the previous call
		 * @param num2 the current element
		 */
		@Override
		public Integer call(Integer num1, Integer num2) throws Exception {
			// System.out.println(num1+"======"+num2);
			return num1 + num2;
		}
	});
	
	System.out.println(num);
	
	sc.close();
}
 
Example #3
Source File: SparkCubingByLayer.java    From kylin-on-parquet-v2 with Apache License 2.0
private Long getRDDCountSum(JavaPairRDD<ByteArray, Object[]> rdd, final int countMeasureIndex) {
    final ByteArray ONE = new ByteArray();
    Long count = rdd.mapValues(new Function<Object[], Long>() {
        @Override
        public Long call(Object[] objects) throws Exception {
            return (Long) objects[countMeasureIndex];
        }
    }).reduce(new Function2<Tuple2<ByteArray, Long>, Tuple2<ByteArray, Long>, Tuple2<ByteArray, Long>>() {
        @Override
        public Tuple2<ByteArray, Long> call(Tuple2<ByteArray, Long> longTuple2, Tuple2<ByteArray, Long> longTuple22)
                throws Exception {
            return new Tuple2<>(ONE, longTuple2._2() + longTuple22._2());
        }
    })._2();
    return count;
}
 
Example #4
Source File: NGlobalDictionaryV2Test.java    From kylin-on-parquet-v2 with Apache License 2.0
private void runWithSparkBuildGlobalDict(NGlobalDictionaryV2 dict, List<String> stringSet) throws IOException {
    KylinConfig config = KylinConfig.getInstanceFromEnv();
    dict.prepareWrite();
    List<Row> rowList = Lists.newLinkedList();
    for (String str : stringSet) {
        rowList.add(RowFactory.create(str));
    }
    Dataset<Row> ds = ss.createDataFrame(rowList,
            new StructType(new StructField[] { DataTypes.createStructField("col1", DataTypes.StringType, true) }));
    ds.toJavaRDD().mapToPair((PairFunction<Row, String, String>) row -> {
        if (row.get(0) == null)
            return new Tuple2<>(null, null);
        return new Tuple2<>(row.get(0).toString(), null);
    }).sortByKey().partitionBy(new HashPartitioner(BUCKET_SIZE)).mapPartitionsWithIndex(
            (Function2<Integer, Iterator<Tuple2<String, String>>, Iterator<Object>>) (bucketId, tuple2Iterator) -> {
                NBucketDictionary bucketDict = dict.loadBucketDictionary(bucketId);
                while (tuple2Iterator.hasNext()) {
                    Tuple2<String, String> tuple2 = tuple2Iterator.next();
                    bucketDict.addRelativeValue(tuple2._1);
                }
                bucketDict.saveBucketDict(bucketId);
                return Lists.newArrayList().iterator();
            }, true).count();

    dict.writeMetaDict(BUCKET_SIZE, config.getGlobalDictV2MaxVersions(), config.getGlobalDictV2VersionTTL());
}
 
Example #5
Source File: BlurBulkLoadSparkProcessor.java    From incubator-retired-blur with Apache License 2.0
@Override
protected Function2<JavaPairRDD<String, RowMutation>, Time, Void> getFunction() {
  return new Function2<JavaPairRDD<String, RowMutation>, Time, Void>() {
    // Blur Thrift Client
    @Override
    public Void call(JavaPairRDD<String, RowMutation> rdd, Time time) throws Exception {
      Iface client = getBlurClient();
      for (Tuple2<String, RowMutation> tuple : rdd.collect()) {
        if (tuple != null) {
          try {
            RowMutation rm = tuple._2;
            // Index using enqueue mutate call
            client.enqueueMutate(rm);
          } catch (Exception ex) {
            LOG.error("Unknown error while trying to call enqueueMutate.", ex);
            throw ex;
          }
        }
      }
      return null;
    }
  };
}
 
Example #6
Source File: ReduceTransform.java    From incubator-nemo with Apache License 2.0
/**
 * Reduce the iterator elements into a single object.
 *
 * @param elements the iterator of elements.
 * @param func     function to apply for reduction.
 * @param <T>      type of the elements.
 * @return the reduced element.
 */
@Nullable
public static <T> T reduceIterator(final Iterator<T> elements, final Function2<T, T, T> func) {
  if (!elements.hasNext()) { // nothing to be done
    return null;
  }

  T res = elements.next();
  while (elements.hasNext()) {
    try {
      res = func.call(res, elements.next());
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }
  return res;
}
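For context, here is a brief usage sketch of the utility above; it is our own illustration rather than code from the Nemo sources, and it assumes java.util.Arrays and java.util.Iterator are imported alongside ReduceTransform.

Iterator<Integer> elements = Arrays.asList(1, 2, 3, 4).iterator();
// Any Function2<T, T, T> works as the reducer; a summing lambda is the simplest case.
Integer sum = ReduceTransform.reduceIterator(elements, (a, b) -> a + b);
System.out.println(sum); // prints 10; an empty iterator would yield null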
 
Example #7
Source File: SparkCubingByLayer.java    From kylin with Apache License 2.0
private Long getRDDCountSum(JavaPairRDD<ByteArray, Object[]> rdd, final int countMeasureIndex) {
    final ByteArray ONE = new ByteArray();
    Long count = rdd.mapValues(new Function<Object[], Long>() {
        @Override
        public Long call(Object[] objects) throws Exception {
            return (Long) objects[countMeasureIndex];
        }
    }).reduce(new Function2<Tuple2<ByteArray, Long>, Tuple2<ByteArray, Long>, Tuple2<ByteArray, Long>>() {
        @Override
        public Tuple2<ByteArray, Long> call(Tuple2<ByteArray, Long> longTuple2, Tuple2<ByteArray, Long> longTuple22)
                throws Exception {
            return new Tuple2<>(ONE, longTuple2._2() + longTuple22._2());
        }
    })._2();
    return count;
}
 
Example #8
Source File: Tokenizer.java    From vn.vitk with GNU General Public License v3.0
/**
 * Counts the number of non-space characters in this data set. This utility method 
 * is used to check the tokenization result.
 * @param lines
 * @return number of characters
 */
int numCharacters(JavaRDD<String> lines) {
	JavaRDD<Integer> lengths = lines.map(new Function<String, Integer>() {
		private static final long serialVersionUID = -2189399343462982586L;
		@Override
		public Integer call(String line) throws Exception {
			line = line.replaceAll("[\\s_]+", "");
			return line.length();
		}
	});
	return lengths.reduce(new Function2<Integer, Integer, Integer>() {
		private static final long serialVersionUID = -8438072946884289401L;

		@Override
		public Integer call(Integer e0, Integer e1) throws Exception {
			return e0 + e1;
		}
	});
}
 
Example #9
Source File: ReduceTransform.java    From nemo with Apache License 2.0
/**
 * Reduce the iterator elements into a single object.
 * @param elements the iterator of elements.
 * @param func function to apply for reduction.
 * @param <T> type of the elements.
 * @return the reduced element.
 */
@Nullable
public static <T> T reduceIterator(final Iterator<T> elements, final Function2<T, T, T> func) {
  if (!elements.hasNext()) { // nothing to be done
    return null;
  }

  T res = elements.next();
  while (elements.hasNext()) {
    try {
      res = func.call(res, elements.next());
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }
  return res;
}
 
Example #10
Source File: SparkJavaPairRDD.java    From incubator-nemo with Apache License 2.0
@Override
public <U> SparkJavaPairRDD<K, U> aggregateByKey(final U zeroValue,
                                                 final Partitioner partitioner,
                                                 final Function2<U, V, U> seqFunc,
                                                 final Function2<U, U, U> combFunc) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
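Since the wrapper above does not yet support aggregateByKey, the following is a hedged sketch of the same operation against stock Spark's JavaPairRDD, purely for reference: the seqFunc (Function2<U, V, U>) folds each value into a per-partition accumulator, and the combFunc (Function2<U, U, U>) merges accumulators across partitions. The JavaSparkContext sc, the imports (java.util.Arrays, scala.Tuple2, org.apache.spark.api.java.JavaPairRDD), and the sample data are assumptions for illustration.

JavaPairRDD<String, Integer> scores = sc.parallelizePairs(Arrays.asList(
    new Tuple2<>("a", 1), new Tuple2<>("a", 4), new Tuple2<>("b", 2)));

// Per-key maximum: the zero value seeds each accumulator, seqFunc folds values in,
// and combFunc merges partial results computed on different partitions.
JavaPairRDD<String, Integer> maxPerKey = scores.aggregateByKey(
    Integer.MIN_VALUE,
    (acc, value) -> Math.max(acc, value),
    (acc1, acc2) -> Math.max(acc1, acc2));

maxPerKey.collect().forEach(t -> System.out.println(t._1() + " -> " + t._2())); // a -> 4, b -> 2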
 
Example #11
Source File: MLSupporter.java    From DDF with Apache License 2.0
@Override
public long[][] getConfusionMatrix(IModel model, double threshold) throws DDFException {
  SparkDDF ddf = (SparkDDF) this.getDDF();
  SparkDDF predictions = (SparkDDF) ddf.ML.applyModel(model, true, false);

  // Now get the underlying RDD to compute
  JavaRDD<double[]> yTrueYPred = (JavaRDD<double[]>) predictions.getJavaRDD(double[].class);
  final double threshold1 = threshold;
  long[] cm = yTrueYPred.map(new Function<double[], long[]>() {
    @Override
    public long[] call(double[] params) {
      byte isPos = toByte(params[0] > threshold1);
      byte predPos = toByte(params[1] > threshold1);

      long[] result = new long[] { 0L, 0L, 0L, 0L };
      result[isPos << 1 | predPos] = 1L;
      return result;
    }
  }).reduce(new Function2<long[], long[], long[]>() {
    @Override
    public long[] call(long[] a, long[] b) {
      return new long[] { a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3] };
    }
  });

  return new long[][] { new long[] { cm[3], cm[2] }, new long[] { cm[1], cm[0] } };
}
 
Example #12
Source File: FunctionCompiler.java    From rheem with Apache License 2.0
/**
 * Create an appropriate {@link Function} for deploying the given {@link ReduceDescriptor}
 * on Apache Spark.
 */
public <T> Function2<T, T, T> compile(ReduceDescriptor<T> descriptor,
                                      SparkExecutionOperator operator,
                                      OptimizationContext.OperatorContext operatorContext,
                                      ChannelInstance[] inputs) {
    final BinaryOperator<T> javaImplementation = descriptor.getJavaImplementation();
    if (javaImplementation instanceof FunctionDescriptor.ExtendedSerializableBinaryOperator) {
        return new ExtendedBinaryOperatorAdapter<>(
                (FunctionDescriptor.ExtendedSerializableBinaryOperator<T>) javaImplementation,
                new SparkExecutionContext(operator, inputs, operatorContext.getOptimizationContext().getIterationNumber())
        );
    } else {
        return new BinaryOperatorAdapter<>(javaImplementation);
    }
}
 
Example #13
Source File: CountLines.java    From examples with Apache License 2.0
@SuppressWarnings("serial")
public static void main(String[] args) {
  SparkConf sparkConf = new SparkConf().setAppName("JavaHBaseBulkGetExample ").setMaster("local[2]");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  JavaRDD<String> textFile = jsc.textFile("hdfs://localhost/user/cloudera/data.txt");
  JavaPairRDD<String, Integer> pairs = textFile.mapToPair(new PairFunction<String, String, Integer>() {
    public Tuple2<String, Integer> call(String s) { return new Tuple2<String, Integer>(s.substring(0, s.indexOf("|")), 1); }
  });
  JavaPairRDD<String, Integer> counts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
    public Integer call(Integer a, Integer b) { return a + b; }
  });
  System.out.println("We have generated " + counts.count() + " users");
  jsc.close();
}
 
Example #14
Source File: SparkJavaPairRDD.java    From incubator-nemo with Apache License 2.0
@Override
public SparkJavaPairRDD<K, V> reduceByKey(final Function2<V, V, V> func) {
  // Explicit conversion
  final PairRDDFunctions<K, V> pairRdd = RDD.rddToPairRDDFunctions(
    rdd, ClassTag$.MODULE$.apply(Object.class), ClassTag$.MODULE$.apply(Object.class), null);
  final RDD<Tuple2<K, V>> reducedRdd = pairRdd.reduceByKey(func);
  return SparkJavaPairRDD.fromRDD(reducedRdd);
}
 
Example #15
Source File: SparkJavaPairRDD.java    From incubator-nemo with Apache License 2.0
@Override
public <C> SparkJavaPairRDD<K, C> combineByKey(final Function<V, C> createCombiner,
                                               final Function2<C, V, C> mergeValue,
                                               final Function2<C, C, C> mergeCombiners,
                                               final int numPartitions) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
 
Example #16
Source File: SparkJavaPairRDD.java    From incubator-nemo with Apache License 2.0
@Override
public <C> SparkJavaPairRDD<K, C> combineByKey(final Function<V, C> createCombiner,
                                               final Function2<C, V, C> mergeValue,
                                               final Function2<C, C, C> mergeCombiners,
                                               final Partitioner partitioner) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
 
Example #17
Source File: SparkJavaPairRDD.java    From incubator-nemo with Apache License 2.0
@Override
public <C> SparkJavaPairRDD<K, C> combineByKey(final Function<V, C> createCombiner,
                                               final Function2<C, V, C> mergeValue,
                                               final Function2<C, C, C> mergeCombiners,
                                               final Partitioner partitioner,
                                               final boolean mapSideCombine,
                                               final Serializer serializer) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
 
Example #18
Source File: SparkJavaPairRDD.java    From incubator-nemo with Apache License 2.0
@Override
public <U> SparkJavaPairRDD<K, U> aggregateByKey(final U zeroValue,
                                                 final int numPartitions,
                                                 final Function2<U, V, U> seqFunc,
                                                 final Function2<U, U, U> combFunc) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
 
Example #19
Source File: JavaPairRDD.java    From nemo with Apache License 2.0
@Override
public JavaPairRDD<K, V> reduceByKey(final Function2<V, V, V> func) {
  final DAGBuilder<IRVertex, IREdge> builder = new DAGBuilder<>(dag);

  final IRVertex reduceByKeyVertex = new OperatorVertex(new ReduceByKeyTransform<K, V>(func));
  builder.addVertex(reduceByKeyVertex, loopVertexStack);

  final IREdge newEdge = new IREdge(getEdgeCommunicationPattern(lastVertex, reduceByKeyVertex),
      lastVertex, reduceByKeyVertex, new SparkCoder(serializer));
  newEdge.setProperty(KeyExtractorProperty.of(new SparkKeyExtractor()));
  builder.connectVertices(newEdge);

  return new JavaPairRDD<>(this.sparkContext, builder.buildWithoutSourceSinkCheck(), reduceByKeyVertex);
}
 
Example #20
Source File: JavaLogQuery.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaLogQuery")
    .getOrCreate();

  JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

  JavaRDD<String> dataSet = (args.length == 1) ? jsc.textFile(args[0]) : jsc.parallelize(exampleApacheLogs);

  JavaPairRDD<Tuple3<String, String, String>, Stats> extracted = dataSet.mapToPair(new PairFunction<String, Tuple3<String, String, String>, Stats>() {
    @Override
    public Tuple2<Tuple3<String, String, String>, Stats> call(String s) {
      return new Tuple2<>(extractKey(s), extractStats(s));
    }
  });

  JavaPairRDD<Tuple3<String, String, String>, Stats> counts = extracted.reduceByKey(new Function2<Stats, Stats, Stats>() {
    @Override
    public Stats call(Stats stats, Stats stats2) {
      return stats.merge(stats2);
    }
  });

  List<Tuple2<Tuple3<String, String, String>, Stats>> output = counts.collect();
  for (Tuple2<?,?> t : output) {
    System.out.println(t._1() + "\t" + t._2());
  }
  spark.stop();
}
 
Example #21
Source File: JavaCustomReceiver.java    From SparkDemo with MIT License
public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: JavaCustomReceiver <hostname> <port>");
    System.exit(1);
  }

  StreamingExamples.setStreamingLogLevels();

  // Create the context with a 1 second batch size
  SparkConf sparkConf = new SparkConf().setAppName("JavaCustomReceiver");
  JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, new Duration(1000));

  // Create an input stream with the custom receiver on target ip:port and count the
  // words in input stream of \n delimited text (eg. generated by 'nc')
  JavaReceiverInputDStream<String> lines = ssc.receiverStream(
    new JavaCustomReceiver(args[0], Integer.parseInt(args[1])));
  JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
    @Override
    public Iterator<String> call(String x) {
      return Arrays.asList(SPACE.split(x)).iterator();
    }
  });
  JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
    new PairFunction<String, String, Integer>() {
      @Override public Tuple2<String, Integer> call(String s) {
        return new Tuple2<>(s, 1);
      }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
      @Override
      public Integer call(Integer i1, Integer i2) {
        return i1 + i2;
      }
    });

  wordCounts.print();
  ssc.start();
  ssc.awaitTermination();
}
 
Example #22
Source File: MapPartitionsWithIndex.java    From SparkDemo with MIT License
private static void mapPartitionsWithIndex(JavaSparkContext sc) {

		List<String> names = Arrays.asList("张三1", "李四1", "王五1", "张三2", "李四2", "王五2", "张三3", "李四3", "王五3", "张三4");

		// Initialize the RDD with 3 partitions
		JavaRDD<String> namesRDD = sc.parallelize(names, 3);
		JavaRDD<String> mapPartitionsWithIndexRDD = namesRDD
				.mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() {

					private static final long serialVersionUID = 1L;

					public Iterator<String> call(Integer v1, Iterator<String> v2) throws Exception {
						List<String> list = new ArrayList<String>();
						while (v2.hasNext()) {
							list.add("Partition index: " + v1 + "\t" + v2.next());
						}
						return list.iterator();
					}
				}, true);

		// Collect the data from the cluster into the driver's local memory
		List<String> result = mapPartitionsWithIndexRDD.collect();
		for (String s : result) {
			System.out.println(s);
		}

		sc.close();
	}
 
Example #23
Source File: Functions.java    From spark-streaming-direct-kafka with Apache License 2.0
/**
 * @param <T> element type
 * @return a function that returns the second of two values
 */
public static <T> Function2<T,T,T> last() {
    return new Function2<T,T,T>() {
        @Override
        public T call(T current, T next) {
            return next;
        }
    };
}
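A hedged usage sketch (ours, not from the project above): a "keep the second value" function like this can collapse duplicate keys when any single representative value will do. Note that reduceByKey gives no ordering guarantee, so "last" means last within the reduction order, not last in the source data. The JavaSparkContext sc and the sample pairs are assumptions for illustration.

JavaPairRDD<String, String> pairs = sc.parallelizePairs(Arrays.asList(
    new Tuple2<>("user1", "v1"), new Tuple2<>("user1", "v2"), new Tuple2<>("user2", "v3")));

// One value survives per key, e.g. ("user1", "v2") or ("user1", "v1") depending on reduction order.
JavaPairRDD<String, String> onePerKey = pairs.reduceByKey(Functions.<String>last());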
 
Example #24
Source File: MizoBuilder.java    From mizo with Apache License 2.0
@Override
public Function2<MizoVertex, String, Boolean> parseVertexProperty() {
    return parseVertexProperty;
}
 
Example #25
Source File: MizoBuilder.java    From mizo with Apache License 2.0
public MizoBuilder parseInEdge(Function2<MizoVertex, String, Boolean> predicate) {
    this.parseInEdge = predicate;

    return this;
}
 
Example #26
Source File: MizoBuilder.java    From mizo with Apache License 2.0
@Override
public Function2<MizoVertex, String, Boolean> parseOutEdge() {
    return parseOutEdge;
}
 
Example #27
Source File: CoverageModelEMWorkspace.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 * A generic function for broadcasting an object to all compute blocks
 *
 * If Spark is enabled:
 *
 *      A {@link Broadcast} will be created from {@param obj} and will be "received" by the compute nodes by calling
 *      {@param pusher}. A reference to the updated RDD will replace the old RDD.
 *
 * If Spark is disabled:
 *
 *      The {@param pusher} function will be called together with {@param obj} and {@link #localComputeBlock}
 *
 * @param obj the object to broadcast
 * @param pusher a map from (V, {@link CoverageModelEMComputeBlock}) -> {@link CoverageModelEMComputeBlock} that
 *               updates the compute block with the broadcasted value
 * @param <V> the type of the broadcasted object
 */
@UpdatesRDD
private <V> void pushToWorkers(@Nonnull final V obj,
                               @Nonnull final Function2<V, CoverageModelEMComputeBlock, CoverageModelEMComputeBlock> pusher) {
    if (sparkContextIsAvailable) {
        final Broadcast<V> broadcastedObj = ctx.broadcast(obj);
        final Function<CoverageModelEMComputeBlock, CoverageModelEMComputeBlock> mapper =
                cb -> pusher.call(broadcastedObj.value(), cb);
        mapWorkers(mapper);
    } else {
        try {
            localComputeBlock = pusher.call(obj, localComputeBlock);
        } catch (final Exception ex) {
            throw new RuntimeException("Can not apply the map function to the local compute block", ex);
        }
    }
}
 
Example #28
Source File: MizoBuilder.java    From mizo with Apache License 2.0
@Override
public Function2<MizoVertex, String, Boolean> parseInEdge() {
    return parseInEdge;
}
 
Example #29
Source File: MizoBuilder.java    From mizo with Apache License 2.0
@Override
public Function2<MizoEdge, String, Boolean> parseEdgeProperty() {
    return parseEdgeProperty;
}
 
Example #30
Source File: MizoBuilder.java    From mizo with Apache License 2.0
public MizoBuilder parseOutEdge(Function2<MizoVertex, String, Boolean> predicate) {
    this.parseOutEdge = predicate;

    return this;
}