Java Code Examples for org.apache.spark.api.java.function.PairFunction

The following examples show how to use org.apache.spark.api.java.function.PairFunction. They are all extracted from open source projects.
Example 1
Project: incubator-blur   File: BlurLoadSparkProcessor.java
public void run() throws IOException {
  SparkConf conf = new SparkConf();
  conf.setAppName(getAppName());
  conf.set(SPARK_SERIALIZER, ORG_APACHE_SPARK_SERIALIZER_KRYO_SERIALIZER);
  JavaSparkUtil.packProjectJars(conf);
  setupSparkConf(conf);

  JavaStreamingContext ssc = new JavaStreamingContext(conf, getDuration());
  List<JavaDStream<T>> streamsList = getStreamsList(ssc);

  // Union all the streams if there is more than 1 stream
  JavaDStream<T> streams = unionStreams(ssc, streamsList);

  JavaPairDStream<String, RowMutation> pairDStream = streams.mapToPair(new PairFunction<T, String, RowMutation>() {
    public Tuple2<String, RowMutation> call(T t) {
      RowMutation rowMutation = convert(t);
      return new Tuple2<String, RowMutation>(rowMutation.getRowId(), rowMutation);
    }
  });

  pairDStream.foreachRDD(getFunction());

  ssc.start();
  ssc.awaitTermination();
}
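
Since PairFunction is a functional interface, the anonymous class above can also be written as a Java 8 lambda (a sketch, assuming the same convert(T) helper; note the lambda still captures the enclosing instance through convert, so that instance must be serializable):

JavaPairDStream<String, RowMutation> pairDStream = streams.mapToPair(t -> {
  RowMutation rowMutation = convert(t);
  return new Tuple2<>(rowMutation.getRowId(), rowMutation);
});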
 
Example 2
Project: incubator-sdap-mudrod   File: MetadataOpt.java
public JavaPairRDD<String, List<String>> tokenizeData(JavaPairRDD<String, String> datasetsContentRDD, String splitter) throws Exception {

    return datasetsContentRDD.mapToPair(new PairFunction<Tuple2<String, String>, String, List<String>>() {
      private static final long serialVersionUID = 1L;

      @Override
      public Tuple2<String, List<String>> call(Tuple2<String, String> arg) throws Exception {
        String content = arg._2;
        List<String> tokens = getTokens(content, splitter);

        return new Tuple2<>(arg._1, tokens);
      }
    });

  }
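
Note that an anonymous PairFunction keeps an implicit reference to its enclosing instance, so the enclosing class must be Serializable for Spark to ship the function to executors. A lambda avoids that capture when it only calls static members (a minimal sketch of the same method body; it assumes getTokens is static, otherwise this is still captured):

return datasetsContentRDD.mapToPair(arg -> new Tuple2<>(arg._1, getTokens(arg._2, splitter)));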
 
Example 3
Project: fst-bench   File: JavaTeraSort.java
public static void main(String[] args) throws Exception {

    if (args.length < 2) {
      System.err.println("Usage: JavaTeraSort <HDFS_INPUT> <HDFS_OUTPUT>");
      System.exit(1);
    }

    SparkConf sparkConf = new SparkConf().setAppName("JavaTeraSort");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    JavaRDD<String> lines = ctx.textFile(args[0], 1);
    Integer parallel = sparkConf.getInt("spark.default.parallelism", ctx.defaultParallelism());
    Integer reducer  = Integer.parseInt(IOCommon.getProperty("hibench.default.shuffle.parallelism").get());
    JavaPairRDD<String, String> words = lines.mapToPair(new PairFunction<String, String, String>() {
        @Override
        public Tuple2<String, String> call(String s) throws Exception {
            return new Tuple2<String, String>(s.substring(0, 10), s.substring(10));
        }
    });


    JavaPairRDD<String, String> sorted = words.sortByKey(true, reducer);

    JavaRDD<String> result = sorted.map(new Function<Tuple2<String, String>, String>() {
        @Override
        public String call(Tuple2<String, String> e) throws Exception {
            return e._1() + e._2();
        }
    });

    result.saveAsTextFile(args[1]);

    ctx.stop();
  }
 
Example 4
Project: mudrod   File: MetadataOpt.java
private JavaPairRDD<String, String> parallizeData(SparkDriver spark, List<Tuple2<String, String>> datasetContent) {

    JavaRDD<Tuple2<String, String>> datasetContentRDD = spark.sc.parallelize(datasetContent);

    return datasetContentRDD.mapToPair(new PairFunction<Tuple2<String, String>, String, String>() {
      private static final long serialVersionUID = 1L;

      @Override
      public Tuple2<String, String> call(Tuple2<String, String> term) throws Exception {
        return term;
      }
    });

  }
 
Example 5
Project: mudrod   File: MetadataOpt.java
public JavaPairRDD<String, List<String>> tokenizeData(JavaPairRDD<String, String> datasetsContentRDD, String splitter) throws Exception {

    return datasetsContentRDD.mapToPair(new PairFunction<Tuple2<String, String>, String, List<String>>() {
      private static final long serialVersionUID = 1L;

      @Override
      public Tuple2<String, List<String>> call(Tuple2<String, String> arg) throws Exception {
        String content = arg._2;
        List<String> tokens = getTokens(content, splitter);

        return new Tuple2<>(arg._1, tokens);
      }
    });

  }
 
Example 6
Project: spark-newsreel-recommender   File: ConcurrentSparkList.java
private void addNewElement(JavaPairRDD newPair, JavaPairRDD timeStamp) {
    item2ReadCount = item2ReadCount
            .union(newPair)
            .coalesce(numPartitions, false)
            .reduceByKey((v1, v2) ->  (Long) v1 +  (Long) v2, numPartitions)
            .mapToPair((PairFunction<Tuple2<Long, Long>, Long, Long>) Tuple2::swap)
            .sortByKey(false, numPartitions)
            .mapToPair((PairFunction<Tuple2<Long, Long>, Long, Long>) Tuple2::swap);
    item2timeStampData = item2timeStampData
            .union(timeStamp)
            .coalesce(numPartitions, false)
            .reduceByKey(replaceValues)
            .mapToPair((PairFunction<Tuple2<Long, Long>, Long, Long>) Tuple2::swap)
            .sortByKey(true, numPartitions)
            .mapToPair((PairFunction<Tuple2<Long, Long>, Long, Long>) Tuple2::swap);
}
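
The PairFunction casts above disambiguate the Tuple2::swap method reference for mapToPair. On its own, this swap / sortByKey / swap sequence is the usual sort-by-value idiom for pair RDDs (a minimal sketch, assuming Long keys and values as in this example):

// Sort a (key -> count) RDD by count, descending: swap, sort on the swapped key, swap back.
static JavaPairRDD<Long, Long> sortByValueDesc(JavaPairRDD<Long, Long> pairs, int numPartitions) {
  return pairs
      .mapToPair((PairFunction<Tuple2<Long, Long>, Long, Long>) Tuple2::swap)
      .sortByKey(false, numPartitions)
      .mapToPair((PairFunction<Tuple2<Long, Long>, Long, Long>) Tuple2::swap);
}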
 
Example 7
Project: beam   File: TranslationUtils.java
/**
 * A utility method that adapts a {@link PairFunction} to a {@link PairFlatMapFunction} with an
 * {@link Iterator} input. This is particularly useful because it allows functions written for
 * mapToPair to be reused in flatMapToPair contexts.
 *
 * @param pairFunction the {@link PairFunction} to adapt.
 * @param <T> the input type.
 * @param <K> the output key type.
 * @param <V> the output value type.
 * @return a {@link PairFlatMapFunction} that accepts an {@link Iterator} as an input and applies
 *     the {@link PairFunction} on every element.
 */
public static <T, K, V> PairFlatMapFunction<Iterator<T>, K, V> pairFunctionToPairFlatMapFunction(
    final PairFunction<T, K, V> pairFunction) {
  return new PairFlatMapFunction<Iterator<T>, K, V>() {

    @Override
    public Iterator<Tuple2<K, V>> call(Iterator<T> itr) throws Exception {
      final Iterator<Tuple2<K, V>> outputItr =
          Iterators.transform(
              itr,
              new com.google.common.base.Function<T, Tuple2<K, V>>() {

                @Override
                public Tuple2<K, V> apply(T t) {
                  try {
                    return pairFunction.call(t);
                  } catch (Exception e) {
                    throw new RuntimeException(e);
                  }
                }
              });
      return outputItr;
    }
  };
}
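
A usage sketch (the pair function, its types, and the input JavaRDD<String> lines are illustrative, not from the Beam sources):

PairFunction<String, String, Integer> toLengthPair = s -> new Tuple2<>(s, s.length());
JavaPairRDD<String, Integer> pairs =
    lines.mapPartitionsToPair(TranslationUtils.pairFunctionToPairFlatMapFunction(toLengthPair));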
 
Example 8
Project: beam   File: CoderHelpers.java
/**
 * A function wrapper for converting a byte array pair to a key-value pair, where
 * values are {@link Iterable}.
 *
 * @param keyCoder Coder to deserialize keys.
 * @param valueCoder Coder to deserialize values.
 * @param <K>   The type of the key being deserialized.
 * @param <V>   The type of the value being deserialized.
 * @return A function that accepts a pair of byte arrays and returns a key-value pair.
 */
public static <K, V> PairFunction<Tuple2<ByteArray, Iterable<byte[]>>, K, Iterable<V>>
    fromByteFunctionIterable(final Coder<K> keyCoder, final Coder<V> valueCoder) {
  return new PairFunction<Tuple2<ByteArray, Iterable<byte[]>>, K, Iterable<V>>() {
    @Override
    public Tuple2<K, Iterable<V>> call(Tuple2<ByteArray, Iterable<byte[]>> tuple) {
      return new Tuple2<>(fromByteArray(tuple._1().getValue(), keyCoder),
        Iterables.transform(tuple._2(), new com.google.common.base.Function<byte[], V>() {
          @Override
          public V apply(byte[] bytes) {
            return fromByteArray(bytes, valueCoder);
          }
        }));
    }
  };
}
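
This wrapper is the decoding half of a group-by-key performed on coder-serialized bytes. A usage sketch (assuming Beam's StringUtf8Coder and VarIntCoder and a JavaPairRDD<String, Integer> pairs; the variable names are illustrative):

JavaPairRDD<String, Iterable<Integer>> grouped =
    pairs.mapToPair(CoderHelpers.toByteFunction(StringUtf8Coder.of(), VarIntCoder.of()))
         .groupByKey()
         .mapToPair(CoderHelpers.fromByteFunctionIterable(StringUtf8Coder.of(), VarIntCoder.of()));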
 
Example 9
Project: nats-connector-spark   File: KeyValueSparkToStandardNatsConnectorLifecycleTest.java
protected void publishToNats(final String subject1, final String subject2, final int partitionsNb) {
	final JavaDStream<String> lines = ssc.textFileStream(tempDir.getAbsolutePath()).repartition(partitionsNb);		
	
	JavaPairDStream<String, String> stream1 =
			lines.mapToPair((PairFunction<String, String, String>) str -> new Tuple2<>(subject1, str));
	JavaPairDStream<String, String> stream2 =
			lines.mapToPair((PairFunction<String, String, String>) str -> new Tuple2<>(subject2, str));
	final JavaPairDStream<String, String> stream = stream1.union(stream2);
	
	if (logger.isDebugEnabled()) {
		stream.print();
	}		
	
	SparkToNatsConnectorPool
		.newPool()
		.withNatsURL(NATS_SERVER_URL)
		.withConnectionTimeout(Duration.ofSeconds(2))
		.publishToNatsAsKeyValue(stream);
}
 
Example 10
Project: spark-dataflow   File: TransformTranslator.java
private static <T> TransformEvaluator<TextIO.Write.Bound<T>> writeText() {
  return new TransformEvaluator<TextIO.Write.Bound<T>>() {
    @Override
    public void evaluate(TextIO.Write.Bound<T> transform, EvaluationContext context) {
      @SuppressWarnings("unchecked")
      JavaPairRDD<T, Void> last =
          ((JavaRDDLike<WindowedValue<T>, ?>) context.getInputRDD(transform))
          .map(WindowingHelpers.<T>unwindowFunction())
          .mapToPair(new PairFunction<T, T, Void>() {
            @Override
            public Tuple2<T, Void> call(T t) throws Exception {
              return new Tuple2<>(t, null);
            }
          });
      ShardTemplateInformation shardTemplateInfo =
          new ShardTemplateInformation(transform.getNumShards(),
              transform.getShardTemplate(), transform.getFilenamePrefix(),
              transform.getFilenameSuffix());
      writeHadoopFile(last, new Configuration(), shardTemplateInfo, Text.class,
          NullWritable.class, TemplatedTextOutputFormat.class);
    }
  };
}
 
Example 11
Project: spark-dataflow   File: CoderHelpers.java
/**
 * A function wrapper for converting a byte array pair to a key-value pair, where
 * values are {@link Iterable}.
 *
 * @param keyCoder Coder to deserialize keys.
 * @param valueCoder Coder to deserialize values.
 * @param <K>   The type of the key being deserialized.
 * @param <V>   The type of the value being deserialized.
 * @return A function that accepts a pair of byte arrays and returns a key-value pair.
 */
static <K, V> PairFunction<Tuple2<ByteArray, Iterable<byte[]>>, K, Iterable<V>>
    fromByteFunctionIterable(final Coder<K> keyCoder, final Coder<V> valueCoder) {
  return new PairFunction<Tuple2<ByteArray, Iterable<byte[]>>, K, Iterable<V>>() {
    @Override
    public Tuple2<K, Iterable<V>> call(Tuple2<ByteArray, Iterable<byte[]>> tuple) {
      return new Tuple2<>(fromByteArray(tuple._1().getValue(), keyCoder),
        Iterables.transform(tuple._2(), new com.google.common.base.Function<byte[], V>() {
          @Override
          public V apply(byte[] bytes) {
            return fromByteArray(bytes, valueCoder);
          }
        }));
    }
  };
}
 
Example 12
Project: SHMACK   File: WordCount.java
@SuppressWarnings("serial")
@Override
public SortedCounts<String> execute(final JavaSparkContext spark) {
	final JavaRDD<String> textFile = spark.textFile(inputFile);
	final JavaRDD<String> words = textFile.flatMap(new FlatMapFunction<String, String>() {
		@Override
		public Iterable<String> call(final String rawJSON) throws TwitterException {
			final Status tweet = TwitterObjectFactory.createStatus(rawJSON);
			String text = tweet.getText();
			return Arrays.asList(text.split(" "));
		}
	});
	final JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
		@Override
		public Tuple2<String, Integer> call(final String s) {
			return new Tuple2<String, Integer>(s.toLowerCase(), 1);
		}
	});
	final JavaPairRDD<String, Integer> counts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
		@Override
		public Integer call(final Integer a, final Integer b) {
			return a + b;
		}
	});
	return SortedCounts.create(counts);
}
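
With Java 8 lambdas the same pipeline shortens considerably (a sketch; note that the Iterable-returning flatMap above is the Spark 1.x signature, while Spark 2.x expects an Iterator):

final JavaPairRDD<String, Integer> counts = spark.textFile(inputFile)
    .flatMap(rawJSON -> Arrays.asList(TwitterObjectFactory.createStatus(rawJSON).getText().split(" ")))
    .mapToPair(s -> new Tuple2<>(s.toLowerCase(), 1))
    .reduceByKey((a, b) -> a + b);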
 
Example 13
Project: GeoSpark   File: VisualizationOperator.java
/**
 * Spatial partitioning without duplicates.
 *
 * @return true, if successful
 * @throws Exception the exception
 */
private boolean spatialPartitioningWithoutDuplicates() throws Exception
{
    this.distributedRasterColorMatrix = this.distributedRasterColorMatrix.mapToPair(new PairFunction<Tuple2<Pixel, Integer>, Pixel, Integer>() {
        @Override
        public Tuple2<Pixel, Integer> call(Tuple2<Pixel, Integer> pixelDoubleTuple2) throws Exception {
            Pixel newPixel = new Pixel(pixelDoubleTuple2._1().getX(),pixelDoubleTuple2._1().getY(),resolutionX,resolutionY);
            newPixel.setDuplicate(false);
            newPixel.setCurrentPartitionId(VisualizationPartitioner.CalculatePartitionId(resolutionX,resolutionY,partitionX, partitionY, pixelDoubleTuple2._1.getX(), pixelDoubleTuple2._1.getY()));
            Tuple2<Pixel,Integer> newPixelDoubleTuple2 = new Tuple2<Pixel, Integer>(newPixel, pixelDoubleTuple2._2());
            return newPixelDoubleTuple2;
        }
    });
    this.distributedRasterColorMatrix = this.distributedRasterColorMatrix.partitionBy(new VisualizationPartitioner(this.resolutionX,this.resolutionY,this.partitionX,this.partitionY));
    return true;
}
 
Example 14
Project: Test_Projects   File: Spark301.java
public static void main(String[] args) {
	
	SparkConf sparkConf = new SparkConf();
	sparkConf.setMaster("local");
	sparkConf.setAppName("TestSpark");
	
	JavaSparkContext sc = new JavaSparkContext(sparkConf);
	JavaRDD<String> input = sc.parallelize(data);
	
	JavaPairRDD<String, String> inputPair = input.mapToPair(
			new PairFunction<String, String, String>() {
				@Override
				public Tuple2<String, String> call(String x)	throws Exception {
					return new Tuple2<String, String>(x.split(" ")[0], x);
				}
			}
	);
	
	System.out.println(inputPair.take(100));

	sc.stop();
}
 
Example 15
Project: stratio-connector-deep   File: QueryExecutorTest.java
@Before
public void before() throws Exception {

    queryExecutor = new QueryExecutor(deepContext, deepConnectionHandler);

    // Stubs
    when(deepConnectionHandler.getConnection(CLUSTERNAME_CONSTANT.getName())).thenReturn(deepConnection);
    when(deepConnection.getExtractorConfig()).thenReturn(extractorConfig);
    when(extractorConfig.clone()).thenReturn(extractorConfig);
    when(deepContext.createJavaRDD(any(ExtractorConfig.class))).thenReturn(singleRdd);
    when(deepContext.createHDFSRDD(any(ExtractorConfig.class))).thenReturn(rdd);
    when(rdd.toJavaRDD()).thenReturn(singleRdd);
    when(singleRdd.collect()).thenReturn(generateListOfCells(3));
    when(singleRdd.filter(any(Function.class))).thenReturn(singleRdd);
    when(singleRdd.map(any(FilterColumns.class))).thenReturn(singleRdd);
    when(singleRdd.mapToPair(any(PairFunction.class))).thenReturn(pairRdd);
    when(singleRdd.keyBy(any(Function.class))).thenReturn(pairRdd);
    when(pairRdd.join(pairRdd)).thenReturn(joinedRdd);
    when(pairRdd.reduceByKey(any(Function2.class))).thenReturn(pairRdd);
    when(pairRdd.map(any(Function.class))).thenReturn(singleRdd);
    when(joinedRdd.map(any(JoinCells.class))).thenReturn(singleRdd);
}
 
Example 16
Project: java-feature-set   File: SparkKeyValueEx.java
@SuppressWarnings("serial")
private static final PairFunction<Integer, Integer, Integer> convertToKeyValue() {

  /**
   * Convert to key-value [key (integer) : value (integer * integer)]
   */
  return new PairFunction<Integer, Integer, Integer>() {

    @Override
    public final Tuple2<Integer, Integer> call(final Integer integer) throws Exception {

      /* Tuple : key (integer) : value (integer * integer) */
      return new Tuple2<Integer, Integer>(integer, integer * integer);
    }
  };
}
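
A usage sketch (assuming a JavaSparkContext sc; the input values are illustrative):

JavaPairRDD<Integer, Integer> squares =
    sc.parallelize(Arrays.asList(1, 2, 3)).mapToPair(convertToKeyValue());
// yields (1, 1), (2, 4), (3, 9)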
 
Example 17
Project: java-feature-set   File: ProductsCountryCount.java
@SuppressWarnings("serial")
private static final JavaPairRDD<Integer, String> userDataAsKeyValue(final JavaRDD<String> userInputFile) {

  /* Map each user record to a (user-id, country) pair, for joining transactions on users */
  return userInputFile.mapToPair(new PairFunction<String, Integer, String>() {

    public Tuple2<Integer, String> call(final String user) {

      logger.debug("User : " + user);
      final String[] userSplit = user.split("\t");

      /* Tuple : key (user-id) : value (country) */
      return new Tuple2<Integer, String>(Integer.valueOf(userSplit[0]), userSplit[3]);
    }
  });
}
 
Example 18
Project: Apache-Spark-2x-for-Java-Developers   File: WordCountTransformOpEx.java
public static void main(String[] args) throws Exception {
  
      System.setProperty("hadoop.home.dir", "E:\\hadoop");
	
   SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]");
   JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));
   Logger rootLogger = LogManager.getRootLogger();
 		rootLogger.setLevel(Level.WARN); 
   List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 10), new Tuple2<>("world", 10));
   JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);
	    

   JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER);
   
   JavaDStream<String> words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() );
  
   JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 );
  
   wordCounts.print();
   
  JavaPairDStream<String, Integer> joinedDstream = wordCounts
      .transformToPair(new Function<JavaPairRDD<String, Integer>, JavaPairRDD<String, Integer>>() {
        @Override
        public JavaPairRDD<String, Integer> call(JavaPairRDD<String, Integer> rdd) throws Exception {
          JavaPairRDD<String, Integer> modRDD = rdd.join(initialRDD).mapToPair(
              new PairFunction<Tuple2<String, Tuple2<Integer, Integer>>, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(
                    Tuple2<String, Tuple2<Integer, Integer>> joinedTuple) throws Exception {
                  return new Tuple2<>(joinedTuple._1(), joinedTuple._2()._1() + joinedTuple._2()._2());
                }
              });
          return modRDD;
        }
      });

  joinedDstream.print();
  streamingContext.start();
  streamingContext.awaitTermination();
}
 
Example 19
Project: Apache-Spark-2x-for-Java-Developers   File: WordCountSocketJava8Ex.java
public static void main(String[] args) throws Exception {
 
     System.setProperty("hadoop.home.dir", "E:\\hadoop");
	
  SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]");
  JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));
  
  List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 10), new Tuple2<>("world", 10));
  JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);
    

  JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER);
  
  JavaDStream<String> words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() );
 
  JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 );
 
  wordCounts.print();
  
  JavaPairDStream<String, Integer> joinedDstream = wordCounts.transformToPair(
      new Function<JavaPairRDD<String, Integer>, JavaPairRDD<String, Integer>>() {
        @Override
        public JavaPairRDD<String, Integer> call(JavaPairRDD<String, Integer> rdd) throws Exception {
          // Join each batch's counts with the static initialRDD, sum both counts,
          // and return the joined RDD rather than discarding it.
          return rdd.join(initialRDD).mapToPair(
              new PairFunction<Tuple2<String, Tuple2<Integer, Integer>>, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(Tuple2<String, Tuple2<Integer, Integer>> joinedTuple)
                    throws Exception {
                  return new Tuple2<>(joinedTuple._1(), joinedTuple._2()._1() + joinedTuple._2()._2());
                }
              });
        }
      });
 
  joinedDstream.print();
  streamingContext.start();
  streamingContext.awaitTermination();
}
 
Example 20
Project: incubator-sdap-mudrod   File: MetadataOpt.java
private JavaPairRDD<String, String> parallizeData(SparkDriver spark, List<Tuple2<String, String>> datasetContent) {

    JavaRDD<Tuple2<String, String>> datasetContentRDD = spark.sc.parallelize(datasetContent);

    return datasetContentRDD.mapToPair(new PairFunction<Tuple2<String, String>, String, String>() {
      private static final long serialVersionUID = 1L;

      @Override
      public Tuple2<String, String> call(Tuple2<String, String> term) throws Exception {
        return term;
      }
    });

  }
 
Example 21
Project: incubator-sdap-mudrod   File: SessionCooccurence.java
/**
 * Filter out retired (out-of-date) metadata.
 *
 * @param es
 *          the Elasticsearch driver
 * @param userDatasetsRDD
 *          datasets extracted from sessions
 * @return filtered session datasets
 */
public JavaPairRDD<String, List<String>> removeRetiredDataset(ESDriver es, JavaPairRDD<String, List<String>> userDatasetsRDD) {

  Map<String, String> nameMap = this.getOnServiceMetadata(es);

  return userDatasetsRDD.mapToPair(new PairFunction<Tuple2<String, List<String>>, String, List<String>>() {
    private static final long serialVersionUID = 1L;

    @Override
    public Tuple2<String, List<String>> call(Tuple2<String, List<String>> arg0) throws Exception {
      List<String> oriDatasets = arg0._2;
      List<String> newDatasets = new ArrayList<>();
      int size = oriDatasets.size();
      for (int i = 0; i < size; i++) {
        String name = oriDatasets.get(i);
        if (nameMap.containsKey(name)) {
          newDatasets.add(nameMap.get(name));
        }
      }
      return new Tuple2<>(arg0._1, newDatasets);
    }
  });

}
 
Example 22
Project: incubator-sdap-mudrod   File: SessionExtractor.java
/**
 * bulidDataQueryRDD: convert a click stream list to (dataset, queries) pairs.
 *
 * @param clickstreamRDD
 *          click stream data
 * @param downloadWeight
 *          weight given to download behavior
 * @return JavaPairRDD whose key is the dataset short name and whose values are queries
 */
public JavaPairRDD<String, List<String>> bulidDataQueryRDD(JavaRDD<ClickStream> clickstreamRDD, int downloadWeight) {
  return clickstreamRDD.mapToPair(new PairFunction<ClickStream, String, List<String>>() {
    private static final long serialVersionUID = 1L;

    @Override
    public Tuple2<String, List<String>> call(ClickStream click) throws Exception {
      List<String> query = new ArrayList<>();
      // important: download behavior is weighted higher than viewing behavior
      boolean download = click.isDownload();
      int weight = 1;
      if (download) {
        weight = downloadWeight;
      }
      for (int i = 0; i < weight; i++) {
        query.add(click.getKeyWords());
      }

      return new Tuple2<>(click.getViewDataset(), query);
    }
  }).reduceByKey(new Function2<List<String>, List<String>, List<String>>() {
    private static final long serialVersionUID = 1L;

    @Override
    public List<String> call(List<String> v1, List<String> v2) throws Exception {
      List<String> list = new ArrayList<>();
      list.addAll(v1);
      list.addAll(v2);
      return list;
    }
  });
}
 
Example 23
Project: incubator-sdap-mudrod   File: SessionExtractor.java
public JavaPairRDD<String, Double> bulidUserItermRDD(JavaRDD<ClickStream> clickstreamRDD) {
  return clickstreamRDD.mapToPair(new PairFunction<ClickStream, String, Double>() {
    private static final long serialVersionUID = 1L;

    @Override
    public Tuple2<String, Double> call(ClickStream click) throws Exception {
      double rate = 1;
      boolean download = click.isDownload();
      if (download) {
        rate = 2;
      }

      String sessionID = click.getSessionID();
      String user = sessionID.split("@")[0];

      return new Tuple2<>(user + "," + click.getViewDataset(), rate);
    }
  }).reduceByKey(new Function2<Double, Double, Double>() {
    private static final long serialVersionUID = 1L;

    @Override
    public Double call(Double v1, Double v2) throws Exception {
      return v1 >= v2 ? v1 : v2;

    }
  });
}
 
Example 24
Project: incubator-sdap-mudrod   File: MetadataExtractor.java
/**
 * buildMetadataRDD: Convert metadata list to JavaPairRDD
 *
 * @param es        an Elasticsearch client node instance
 * @param sc        spark context
 * @param index     index name of log processing application
 * @param metadatas metadata list
 * @return PairRDD, in each pair key is metadata short name and value is term
 * list extracted from metadata variables.
 */
protected JavaPairRDD<String, List<String>> buildMetadataRDD(ESDriver es, JavaSparkContext sc, String index, List<PODAACMetadata> metadatas) {
  JavaRDD<PODAACMetadata> metadataRDD = sc.parallelize(metadatas);
  JavaPairRDD<String, List<String>> metadataTermsRDD = metadataRDD.mapToPair(new PairFunction<PODAACMetadata, String, List<String>>() {
    private static final long serialVersionUID = 1L;

    @Override
    public Tuple2<String, List<String>> call(PODAACMetadata metadata) throws Exception {
      return new Tuple2<String, List<String>>(metadata.getShortName(), metadata.getAllTermList());
    }
  }).reduceByKey(new Function2<List<String>, List<String>, List<String>>() {
    private static final long serialVersionUID = 1L;

    @Override
    public List<String> call(List<String> v1, List<String> v2) throws Exception {
      List<String> list = new ArrayList<String>();
      list.addAll(v1);
      list.addAll(v2);
      return list;
    }
  });

  return metadataTermsRDD;
}
 
Example 25
Project: gcp   File: Spark8Organized.java
private static PairFunction<Tuple2<String, String>, String, ExampleXML> parseXml() {
  ParseXML parser = new ParseXML();
  return tuple -> {
    try {
      return new Tuple2<>(tuple._1(), parser.call(tuple._2()));
    } catch(JAXBException badXML) {
      System.err.printf("Bad XML at %s\n", tuple._1());
      badXML.printStackTrace();
      return null;
    }
  };
}
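
Returning null from a PairFunction puts null tuples into the resulting RDD, which downstream code must filter out. An alternative that simply drops unparseable records is flatMapToPair (a sketch, assuming the same ParseXML helper, java.util.Collections, and Spark 2.x, where PairFlatMapFunction returns an Iterator):

private static PairFlatMapFunction<Tuple2<String, String>, String, ExampleXML> parseXmlOrDrop() {
  ParseXML parser = new ParseXML();
  return tuple -> {
    try {
      // one output pair for a good record
      return Collections.singletonList(new Tuple2<>(tuple._1(), parser.call(tuple._2()))).iterator();
    } catch (JAXBException badXML) {
      System.err.printf("Bad XML at %s%n", tuple._1());
      // no output for a bad record
      return Collections.<Tuple2<String, ExampleXML>>emptyIterator();
    }
  };
}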
 
Example 26
Project: gcp   File: Spark8Organized.java
private static PairFunction<Tuple2<String, ExampleXML>, Object, JsonObject> prepToBq() {
  return tuple -> {
    JsonObject json = new JsonObject();
    json.addProperty("property1", tuple._2().getProperty1());
    json.addProperty("insertId", tuple._1());
    return new Tuple2<>(null, json);
  };
}
 
Example 27
Project: gspark   File: JavaLogQuery.java
public static void main(String[] args) {

    SparkConf sparkConf = new SparkConf().setAppName("JavaLogQuery");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);

    JavaRDD<String> dataSet = (args.length == 1) ? jsc.textFile(args[0]) : jsc.parallelize(exampleApacheLogs);

    JavaPairRDD<Tuple3<String, String, String>, Stats> extracted = dataSet.mapToPair(new PairFunction<String, Tuple3<String, String, String>, Stats>() {
      @Override
      public Tuple2<Tuple3<String, String, String>, Stats> call(String s) {
        return new Tuple2<Tuple3<String, String, String>, Stats>(extractKey(s), extractStats(s));
      }
    });

    JavaPairRDD<Tuple3<String, String, String>, Stats> counts = extracted.reduceByKey(new Function2<Stats, Stats, Stats>() {
      @Override
      public Stats call(Stats stats, Stats stats2) {
        return stats.merge(stats2);
      }
    });

    List<Tuple2<Tuple3<String, String, String>, Stats>> output = counts.collect();
    for (Tuple2<?,?> t : output) {
      System.out.println(t._1() + "\t" + t._2());
    }
    jsc.stop();
  }
 
Example 28
Project: gspark   File: JavaTC.java
public static void main(String[] args) {
  SparkConf sparkConf = new SparkConf().setAppName("JavaHdfsLR");
  JavaSparkContext sc = new JavaSparkContext(sparkConf);
  Integer slices = (args.length > 0) ? Integer.parseInt(args[0]): 2;
  JavaPairRDD<Integer, Integer> tc = sc.parallelizePairs(generateGraph(), slices).cache();

  // Linear transitive closure: each round grows paths by one edge,
  // by joining the graph's edges with the already-discovered paths.
  // e.g. join the path (y, z) from the TC with the edge (x, y) from
  // the graph to obtain the path (x, z).

  // Because join() joins on keys, the edges are stored in reversed order.
  JavaPairRDD<Integer, Integer> edges = tc.mapToPair(
    new PairFunction<Tuple2<Integer, Integer>, Integer, Integer>() {
      @Override
      public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> e) {
        return new Tuple2<Integer, Integer>(e._2(), e._1());
      }
  });

  long oldCount;
  long nextCount = tc.count();
  do {
    oldCount = nextCount;
    // Perform the join, obtaining an RDD of (y, (z, x)) pairs,
    // then project the result to obtain the new (x, z) paths.
    tc = tc.union(tc.join(edges).mapToPair(ProjectFn.INSTANCE)).distinct().cache();
    nextCount = tc.count();
  } while (nextCount != oldCount);

  System.out.println("TC has " + tc.count() + " edges.");
  sc.stop();
}
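
ProjectFn, referenced above, projects the (y, (z, x)) pairs produced by the join back to (x, z) edges; in the upstream Spark example it is defined roughly as:

static final class ProjectFn implements PairFunction<Tuple2<Integer, Tuple2<Integer, Integer>>, Integer, Integer> {
  static final ProjectFn INSTANCE = new ProjectFn();

  @Override
  public Tuple2<Integer, Integer> call(Tuple2<Integer, Tuple2<Integer, Integer>> triple) {
    return new Tuple2<>(triple._2()._2(), triple._2()._1());
  }
}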
 
Example 29
Project: mudrod   File: SessionCooccurence.java
/**
 * Filter out retired (out-of-date) metadata.
 *
 * @param es
 *          the Elasticsearch driver
 * @param userDatasetsRDD
 *          datasets extracted from sessions
 * @return filtered session datasets
 */
public JavaPairRDD<String, List<String>> removeRetiredDataset(ESDriver es, JavaPairRDD<String, List<String>> userDatasetsRDD) {

  Map<String, String> nameMap = this.getOnServiceMetadata(es);

  return userDatasetsRDD.mapToPair(new PairFunction<Tuple2<String, List<String>>, String, List<String>>() {
    private static final long serialVersionUID = 1L;

    @Override
    public Tuple2<String, List<String>> call(Tuple2<String, List<String>> arg0) throws Exception {
      List<String> oriDatasets = arg0._2;
      List<String> newDatasets = new ArrayList<>();
      int size = oriDatasets.size();
      for (int i = 0; i < size; i++) {
        String name = oriDatasets.get(i);
        if (nameMap.containsKey(name)) {
          newDatasets.add(nameMap.get(name));
        }
      }
      return new Tuple2<>(arg0._1, newDatasets);
    }
  });

}
 
Example 30
Project: mudrod   File: SessionExtractor.java
/**
 * bulidDataQueryRDD: convert a click stream list to (dataset, queries) pairs.
 *
 * @param clickstreamRDD
 *          click stream data
 * @param downloadWeight
 *          weight given to download behavior
 * @return JavaPairRDD whose key is the dataset short name and whose values are queries
 */
public JavaPairRDD<String, List<String>> bulidDataQueryRDD(JavaRDD<ClickStream> clickstreamRDD, int downloadWeight) {
  return clickstreamRDD.mapToPair(new PairFunction<ClickStream, String, List<String>>() {
    private static final long serialVersionUID = 1L;

    @Override
    public Tuple2<String, List<String>> call(ClickStream click) throws Exception {
      List<String> query = new ArrayList<>();
      // important: download behavior is weighted higher than viewing behavior
      boolean download = click.isDownload();
      int weight = 1;
      if (download) {
        weight = downloadWeight;
      }
      for (int i = 0; i < weight; i++) {
        query.add(click.getKeyWords());
      }

      return new Tuple2<>(click.getViewDataset(), query);
    }
  }).reduceByKey(new Function2<List<String>, List<String>, List<String>>() {
    private static final long serialVersionUID = 1L;

    @Override
    public List<String> call(List<String> v1, List<String> v2) throws Exception {
      List<String> list = new ArrayList<>();
      list.addAll(v1);
      list.addAll(v2);
      return list;
    }
  });
}
 
Example 31
Project: mudrod   File: SessionExtractor.java
public JavaPairRDD<String, Double> bulidUserItermRDD(JavaRDD<ClickStream> clickstreamRDD) {
  return clickstreamRDD.mapToPair(new PairFunction<ClickStream, String, Double>() {
    private static final long serialVersionUID = 1L;

    @Override
    public Tuple2<String, Double> call(ClickStream click) throws Exception {
      double rate = 1;
      boolean download = click.isDownload();
      if (download) {
        rate = 2;
      }

      String sessionID = click.getSessionID();
      String user = sessionID.split("@")[0];

      return new Tuple2<>(user + "," + click.getViewDataset(), rate);
    }
  }).reduceByKey(new Function2<Double, Double, Double>() {
    private static final long serialVersionUID = 1L;

    @Override
    public Double call(Double v1, Double v2) throws Exception {
      return v1 >= v2 ? v1 : v2;

    }
  });
}
 
Example 32
Project: mudrod   File: MetadataExtractor.java
/**
 * buildMetadataRDD: Convert metadata list to JavaPairRDD
 *
 * @param es        an Elasticsearch client node instance
 * @param sc        spark context
 * @param index     index name of log processing application
 * @param metadatas metadata list
 * @return PairRDD, in each pair key is metadata short name and value is term
 * list extracted from metadata variables.
 */
protected JavaPairRDD<String, List<String>> buildMetadataRDD(ESDriver es, JavaSparkContext sc, String index, List<PODAACMetadata> metadatas) {
  JavaRDD<PODAACMetadata> metadataRDD = sc.parallelize(metadatas);
  JavaPairRDD<String, List<String>> metadataTermsRDD = metadataRDD.mapToPair(new PairFunction<PODAACMetadata, String, List<String>>() {
    private static final long serialVersionUID = 1L;

    @Override
    public Tuple2<String, List<String>> call(PODAACMetadata metadata) throws Exception {
      return new Tuple2<String, List<String>>(metadata.getShortName(), metadata.getAllTermList());
    }
  }).reduceByKey(new Function2<List<String>, List<String>, List<String>>() {
    private static final long serialVersionUID = 1L;

    @Override
    public List<String> call(List<String> v1, List<String> v2) throws Exception {
      List<String> list = new ArrayList<String>();
      list.addAll(v1);
      list.addAll(v2);
      return list;
    }
  });

  return metadataTermsRDD;
}
 
Example 33
Project: hbase-maprdb-spark   File: ReadFromHbaseDF.java
public static void main(String[] args) {

    SparkConf sparkConf = new SparkConf()
            .setAppName("ReadFromMapRDB-DF-Java")
            .setMaster("local[1]");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    SQLContext sqlContext = new SQLContext(jsc);

    Configuration config = null;
    try {
      config = HBaseConfiguration.create();
      config.set(TableInputFormat.INPUT_TABLE, "/apps/tests/users_profiles");
    } catch (Exception ce) {
      ce.printStackTrace();
    }

    JavaPairRDD<ImmutableBytesWritable, Result> hBaseRDD =
            jsc.newAPIHadoopRDD(config, TableInputFormat.class, ImmutableBytesWritable.class, Result.class);

    // convert each HBase Result into a (rowkey, User) pair
    JavaPairRDD<String, User> rowPairRDD = hBaseRDD.mapToPair(

            new PairFunction<Tuple2<ImmutableBytesWritable, Result>, String, User>() {
              @Override
              public Tuple2<String, User> call(
                      Tuple2<ImmutableBytesWritable, Result> entry) throws Exception {

                Result r = entry._2;
                String rowKey = Bytes.toString(r.getRow());

                User user = new User();
                user.setRowkey(rowKey);
                user.setFirstName(Bytes.toString(r.getValue(Bytes.toBytes("default"), Bytes.toBytes("first_name"))));
                user.setLastName(Bytes.toString(r.getValue(Bytes.toBytes("default"), Bytes.toBytes("last_name"))));

                return new Tuple2<>(rowKey, user);
              }
            });

    System.out.println("************ RDD *************");
    System.out.println(rowPairRDD.count());
    System.out.println(rowPairRDD.keys().collect());
    System.out.println(rowPairRDD.values().collect());

    System.out.println("************ DF *************");
    DataFrame df = sqlContext.createDataFrame(rowPairRDD.values(), User.class);

    System.out.println(df.count());
    System.out.println(df.schema());
    df.show();

    System.out.println("************ DF with SQL *************");
    df.registerTempTable("USER_TABLE");
    DataFrame dfSql = sqlContext.sql("SELECT *  FROM USER_TABLE  WHERE firstName = 'Ally' ");
    System.out.println(dfSql.count());
    System.out.println(dfSql.schema());
    dfSql.show();


    jsc.close();

  }
 
Example 34
Project: envelope   File: DummyStreamInput.java
@Override
public PairFunction<?, ?, ?> getPrepareFunction() {
  return new PairFunction<Long, Void, Row>() {
    @Override
    public Tuple2<Void, Row> call(Long aLong) throws Exception {
      return new Tuple2<>(null, (Row)new RowWithSchema(schema, aLong));
    }
  };
}
 
Example 35
Project: beam   File: TranslationUtils.java
/** {@link KV} to pair function. */
public static <K, V> PairFunction<KV<K, V>, K, V> toPairFunction() {
  return new PairFunction<KV<K, V>, K, V>() {
    @Override
    public Tuple2<K, V> call(KV<K, V> kv) {
      return new Tuple2<>(kv.getKey(), kv.getValue());
    }
  };
}
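
A usage sketch (assuming a JavaRDD<KV<String, Long>> kvs; the name is illustrative):

JavaPairRDD<String, Long> pairs =
    kvs.mapToPair(TranslationUtils.<String, Long>toPairFunction());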
 
Example 36
Project: beam   File: TranslationUtils.java
/** Extract the key from a {@link WindowedValue} of a {@link KV} into a pair. */
public static <K, V>
    PairFunction<WindowedValue<KV<K, V>>, K, WindowedValue<KV<K, V>>>
        toPairByKeyInWindowedValue() {
  return new PairFunction<WindowedValue<KV<K, V>>, K, WindowedValue<KV<K, V>>>() {
    @Override
    public Tuple2<K, WindowedValue<KV<K, V>>> call(WindowedValue<KV<K, V>> windowedKv)
        throws Exception {
      return new Tuple2<>(windowedKv.getValue().getKey(), windowedKv);
    }
  };
}
 
Example 37
Project: beam   File: TranslationUtils.java
/**
 * Returns a pair function to convert a value to bytes via a coder.
 *
 * @param coderMap mapping between a TupleTag and a coder
 * @return a pair function to convert a value to bytes via a coder
 */
public static PairFunction<Tuple2<TupleTag<?>, WindowedValue<?>>, TupleTag<?>, byte[]>
    getTupleTagEncodeFunction(final Map<TupleTag<?>, Coder<WindowedValue<?>>> coderMap) {
  return new PairFunction<Tuple2<TupleTag<?>, WindowedValue<?>>, TupleTag<?>, byte[]>() {

    @Override public Tuple2<TupleTag<?>, byte[]>
    call(Tuple2<TupleTag<?>, WindowedValue<?>> tuple2) throws Exception {
      TupleTag<?> tupleTag = tuple2._1;
      WindowedValue<?> windowedValue = tuple2._2;
      return new Tuple2<TupleTag<?>, byte[]>
          (tupleTag, CoderHelpers.toByteArray(windowedValue, coderMap.get(tupleTag)));
    }
  };
}
 
Example 38
Project: beam   File: TranslationUtils.java
/**
 * Returns a pair function to convert bytes back to a value via a coder.
 *
 * @param coderMap mapping between a TupleTag and a coder
 * @return a pair function to convert bytes to a value via a coder
 */
public static PairFunction<Tuple2<TupleTag<?>, byte[]>, TupleTag<?>, WindowedValue<?>>
    getTupleTagDecodeFunction(final Map<TupleTag<?>, Coder<WindowedValue<?>>> coderMap) {
  return new PairFunction<Tuple2<TupleTag<?>, byte[]>, TupleTag<?>, WindowedValue<?>>() {

    @Override public Tuple2<TupleTag<?>, WindowedValue<?>>
    call(Tuple2<TupleTag<?>, byte[]> tuple2) throws Exception {
      TupleTag<?> tupleTag = tuple2._1;
      byte[] windowedByteValue = tuple2._2;
      return new Tuple2<TupleTag<?>, WindowedValue<?>>
          (tupleTag, CoderHelpers.fromByteArray(windowedByteValue, coderMap.get(tupleTag)));
    }
  };
}
 
Example 39
Project: beam   File: CoderHelpers.java
/**
 * A function wrapper for converting a key-value pair to a byte array pair.
 *
 * @param keyCoder Coder to serialize keys.
 * @param valueCoder Coder to serialize values.
 * @param <K>   The type of the key being serialized.
 * @param <V>   The type of the value being serialized.
 * @return A function that accepts a key-value pair and returns a pair of byte arrays.
 */
public static <K, V> PairFunction<Tuple2<K, V>, ByteArray, byte[]> toByteFunction(
    final Coder<K> keyCoder, final Coder<V> valueCoder) {
  return new PairFunction<Tuple2<K, V>, ByteArray, byte[]>() {
    @Override
    public Tuple2<ByteArray, byte[]> call(Tuple2<K, V> kv) {
      return new Tuple2<>(new ByteArray(toByteArray(kv._1(), keyCoder)), toByteArray(kv._2(),
          valueCoder));
    }
  };
}
 
Example 40
Project: beam   File: CoderHelpers.java
/**
 * A function wrapper for converting a byte array pair to a key-value pair.
 *
 * @param keyCoder Coder to deserialize keys.
 * @param valueCoder Coder to deserialize values.
 * @param <K>   The type of the key being deserialized.
 * @param <V>   The type of the value being deserialized.
 * @return A function that accepts a pair of byte arrays and returns a key-value pair.
 */
public static <K, V> PairFunction<Tuple2<ByteArray, byte[]>, K, V> fromByteFunction(
    final Coder<K> keyCoder, final Coder<V> valueCoder) {
  return new PairFunction<Tuple2<ByteArray, byte[]>, K, V>() {
    @Override
    public Tuple2<K, V> call(Tuple2<ByteArray, byte[]> tuple) {
      return new Tuple2<>(fromByteArray(tuple._1().getValue(), keyCoder),
          fromByteArray(tuple._2(), valueCoder));
    }
  };
}
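
toByteFunction and fromByteFunction are used as a matched pair so that a shuffle moves coder-serialized bytes rather than Java-serialized objects (a sketch, assuming Beam's StringUtf8Coder and VarIntCoder, a JavaPairRDD<String, Integer> pairs, and an int numPartitions; the names are illustrative):

JavaPairRDD<String, Integer> viaBytes =
    pairs.mapToPair(CoderHelpers.toByteFunction(StringUtf8Coder.of(), VarIntCoder.of()))
         .repartition(numPartitions)  // the shuffle operates on byte arrays
         .mapToPair(CoderHelpers.fromByteFunction(StringUtf8Coder.of(), VarIntCoder.of()));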