Java Code Examples for org.apache.spark.api.java.function.PairFunction

The following examples show how to use org.apache.spark.api.java.function.PairFunction. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may want to check out the right sidebar which shows the related API usage.
Example 1
/**
 * Builds the RDD of detailed visit actions by joining the aggregated session info
 * with the per-session action rows on session id.
 * <p>
 * Only the action row of each joined record is kept; the aggregated info is used
 * solely to restrict the result to session ids present in both inputs.
 *
 * @param sessionid2aggrInfoRDD (session id, aggregated info) pairs; presumably already
 *        filtered by the caller - confirm against the call site
 * @param sessionid2actionRDD (session id, action row) pairs
 * @return (session id, action row) pairs for the session ids found in both inputs
 */
private static JavaPairRDD<String, Row> getSessionid2detailRDD(
		JavaPairRDD<String, String> sessionid2aggrInfoRDD,
		JavaPairRDD<String, Row> sessionid2actionRDD) {
	// Projection that drops the aggregated-info half of every joined record.
	PairFunction<Tuple2<String, Tuple2<String, Row>>, String, Row> keepActionRow =
			new PairFunction<Tuple2<String, Tuple2<String, Row>>, String, Row>() {

				private static final long serialVersionUID = 1L;

				@Override
				public Tuple2<String, Row> call(
						Tuple2<String, Tuple2<String, Row>> joined) throws Exception {
					String sessionId = joined._1;
					Row actionRow = joined._2._2;
					return new Tuple2<String, Row>(sessionId, actionRow);
				}

			};

	return sessionid2aggrInfoRDD.join(sessionid2actionRDD).mapToPair(keepActionRow);
}
 
Example 2
Source Project: sparkResearch   Source File: KafkaStreaming.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Runs a streaming word count over a Kafka topic and prints the per-batch counts.
 * <p>
 * Each Kafka message value is split on {@code SPACE}, every word is mapped to
 * {@code (word, 1)} and the pairs are summed by key.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("KafkaWordCount").setMaster("local[2]");
    // NOTE(review): the batch interval is 10000 *seconds* - confirm this was not meant to be milliseconds.
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10000));
    // Set the checkpoint directory
    streamingContext.checkpoint("HDFS URL");
    Map<String, Integer> topicThread = new HashMap<>(1);
    topicThread.put(TOPIC, THREAD);
    JavaPairInputDStream<String, String> dStream = KafkaUtils.createStream(streamingContext, HOST, GROP, topicThread);

    // Split the value of every (key, value) message into words
    JavaDStream<String> words = dStream.flatMap((FlatMapFunction<Tuple2<String, String>, String>) stringStringTuple2 -> Arrays.asList(SPACE.split(stringStringTuple2._2)).iterator());

    // Count the occurrences of each word
    JavaPairDStream<String, Integer> result = words.mapToPair((PairFunction<String, String, Integer>) s -> new Tuple2<>(s, 1)).reduceByKey((Function2<Integer, Integer, Integer>) (v1, v2) -> v1 + v2);

    try {
        result.print();
        streamingContext.start();
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
        // Restore the interrupt flag so the interruption stays observable after this handler.
        Thread.currentThread().interrupt();
    }
}
 
Example 3
Source Project: sparkResearch   Source File: Accumulator.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Demonstrates a custom accumulator: every element of a small RDD is paired with
 * a random suffix and the resulting values are added to an {@code AttackAccumulator}.
 * <p>
 * The Spark session is stopped when the job finishes or fails.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local[4]").appName("AttackFind").getOrCreate();
    try {
        // Initialize the Java Spark context on top of the session's context
        JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(sparkSession.sparkContext());
        // Log output level
        javaSparkContext.setLogLevel("ERROR");
        // Create the RDD
        JavaRDD<String> rdd = javaSparkContext.parallelize(Arrays.asList(JavaBean.origin_id, JavaBean.asset_name)).cache();

        AttackAccumulator attackAccumulator = new AttackAccumulator();
        // Register the accumulator
        javaSparkContext.sc().register(attackAccumulator, "attack_count");
        // Generate a random number in [0, 10) and append it to the element as the value
        JavaPairRDD<String, String> javaPairRDD = rdd.mapToPair((PairFunction<String, String, String>) s -> {
            int random = new Random().nextInt(10);
            return new Tuple2<>(s, s + ":" + random);
        });

        javaPairRDD.foreach((VoidFunction<Tuple2<String, String>>) tuple2 -> {
            attackAccumulator.add(tuple2._2);
        });
        System.out.println(attackAccumulator.value());
    } finally {
        // Release the session even when the job above throws.
        sparkSession.stop();
    }
}
 
Example 4
/**
 * Builds the bucketed global dictionary for the given strings with a Spark job.
 * <p>
 * The strings become a single-column data set, are keyed by their value, sorted,
 * hash-partitioned into {@code BUCKET_SIZE} buckets, and every partition adds its
 * keys to the bucket dictionary of the same index. The dictionary metadata is
 * written once the job has finished.
 *
 * @param dict the global dictionary to write into
 * @param stringSet the values to encode
 * @throws IOException declared for the dictionary operations
 */
private void runWithSparkBuildGlobalDict(NGlobalDictionaryV2 dict, List<String> stringSet) throws IOException {
    KylinConfig config = KylinConfig.getInstanceFromEnv();
    dict.prepareWrite();

    // One single-column row per input string.
    List<Row> rows = Lists.newLinkedList();
    for (String value : stringSet) {
        rows.add(RowFactory.create(value));
    }
    StructField[] fields = { DataTypes.createStructField("col1", DataTypes.StringType, true) };
    Dataset<Row> ds = ss.createDataFrame(rows, new StructType(fields));

    ds.toJavaRDD()
            .mapToPair((PairFunction<Row, String, String>) row -> {
                // A null cell is kept as a null key; the value slot is never used.
                Object cell = row.get(0);
                String key = (cell == null) ? null : cell.toString();
                return new Tuple2<>(key, null);
            })
            .sortByKey()
            .partitionBy(new HashPartitioner(BUCKET_SIZE))
            .mapPartitionsWithIndex(
                    (Function2<Integer, Iterator<Tuple2<String, String>>, Iterator<Object>>) (bucketId, entries) -> {
                        NBucketDictionary bucketDict = dict.loadBucketDictionary(bucketId);
                        while (entries.hasNext()) {
                            bucketDict.addRelativeValue(entries.next()._1);
                        }
                        bucketDict.saveBucketDict(bucketId);
                        // Nothing is emitted; the partition is processed for its side effect only.
                        return Lists.newArrayList().iterator();
                    }, true)
            .count(); // action that forces the partitions to be processed

    dict.writeMetaDict(BUCKET_SIZE, config.getGlobalDictV2MaxVersions(), config.getGlobalDictV2VersionTTL());
}
 
Example 5
Source Project: mmtf-spark   Source File: MmtfReader.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Reads an MMTF-Hadoop Sequence file. The Hadoop Sequence file may contain
 * either gzip compressed or uncompressed values.
 * See the <a href="https://mmtf.rcsb.org/download.html">MMTF download page</a>
 * for file download information.
 * 
 * @param path Path to Hadoop sequence file
 * @param sc Spark context
 * @return structure data as keyword/value pairs
 */
public static JavaPairRDD<String, StructureDataInterface> readSequenceFile(String path, JavaSparkContext sc) {
	return sc
			.sequenceFile(path, Text.class, BytesWritable.class)
			.mapToPair(new PairFunction<Tuple2<Text, BytesWritable>,String, StructureDataInterface>() {
				private static final long serialVersionUID = 3512575873287789314L;

				@Override
				public Tuple2<String, StructureDataInterface> call(Tuple2<Text, BytesWritable> t) throws Exception {
					// copy the payload once; it serves as gzip input and as the uncompressed fallback
					byte[] values = t._2.copyBytes();
					
					// if data are gzipped, unzip them first
					try {
					    values = ReaderUtils.deflateGzip(values);
					} catch (ZipException ignored) {
					    // not gzip data: continue with the raw bytes
					}
					
					// deserialize message pack
					MmtfStructure mmtf = new MessagePackSerialization().deserialize(new ByteArrayInputStream(values)); 
					
					// decode message pack
					return new Tuple2<String, StructureDataInterface>(t._1.toString(), new GenericDecoder(mmtf)); 
				}
			});
}
 
Example 6
Source Project: mmtf-spark   Source File: MmtfReader.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Reads a randomly sampled subset of the PDB entries in a Hadoop Sequence file.
 * The entries are sampled without replacement using the given fraction and seed;
 * values may be gzip compressed or uncompressed.
 * 
 * @param path Path to Hadoop sequence file
 * @param fraction Fraction of entries to be read [0,1]
 * @param seed Seed for random number generator
 * @param sc Spark context
 * @return structure data as keyword/value pairs
 */
public static JavaPairRDD<String, StructureDataInterface> readSequenceFile(String path, double fraction, long seed, JavaSparkContext sc) {
	return sc
			.sequenceFile(path, Text.class, BytesWritable.class)
			.sample(false, fraction, seed)
			.mapToPair(new PairFunction<Tuple2<Text, BytesWritable>,String, StructureDataInterface>() {
				private static final long serialVersionUID = 3512575873287789314L;

				@Override
				public Tuple2<String, StructureDataInterface> call(Tuple2<Text, BytesWritable> t) throws Exception {
					// copy the payload once; it serves as gzip input and as the uncompressed fallback
					byte[] values = t._2.copyBytes();

					// if data are gzipped, unzip the binary MessagePack data first
					try {
					    values = ReaderUtils.deflateGzip(values);
					} catch (ZipException ignored) {
					    // not gzip data: continue with the raw bytes
					}
					
					// deserialize message pack
					MmtfStructure mmtf = new MessagePackSerialization().deserialize(new ByteArrayInputStream(values));
					
					// decode message pack
					return new Tuple2<String, StructureDataInterface>(t._1.toString(), new GenericDecoder(mmtf));
				}
			});
}
 
Example 7
/**
 * Configures Spark, wires the streaming pipeline and blocks until the streaming
 * context terminates.
 * <p>
 * All input streams are unioned, every element is converted to a
 * {@code RowMutation} keyed by its row id, and each resulting RDD is handed to
 * the function returned by {@code getFunction()}.
 *
 * @throws IOException declared by this method's contract
 */
public void run() throws IOException {
  SparkConf conf = new SparkConf();
  conf.setAppName(getAppName());
  conf.set(SPARK_SERIALIZER, ORG_APACHE_SPARK_SERIALIZER_KRYO_SERIALIZER);
  JavaSparkUtil.packProjectJars(conf);
  setupSparkConf(conf);

  JavaStreamingContext ssc = new JavaStreamingContext(conf, getDuration());

  // Collapse every configured stream into a single one.
  List<JavaDStream<T>> streamsList = getStreamsList(ssc);
  JavaDStream<T> streams = unionStreams(ssc, streamsList);

  // Keys each converted mutation by its row id.
  PairFunction<T, String, RowMutation> toRowMutation = new PairFunction<T, String, RowMutation>() {
    @Override
    public Tuple2<String, RowMutation> call(T element) {
      RowMutation mutation = convert(element);
      String rowId = mutation.getRowId();
      return new Tuple2<String, RowMutation>(rowId, mutation);
    }
  };

  JavaPairDStream<String, RowMutation> pairDStream = streams.mapToPair(toRowMutation);
  pairDStream.foreachRDD(getFunction());

  ssc.start();
  ssc.awaitTermination();
}
 
Example 8
Source Project: searchanalytics-bigdata   Source File: SharkQueryServiceImpl.java    License: MIT License 6 votes vote down vote up
@Override
	/*
	 * Queries the search_clicks table through Shark, prints the number of result
	 * rows, then maps every row to an (int "key", string "value") pair and prints
	 * the collected pairs. Neither dbName nor tbName is read by this method.
	 *
	 * NOTE(review): the query selects count(*) while the mapping reads columns
	 * named "key" and "value" - confirm the two are meant to go together.
	 */
	public void getSearchClicks(String dbName, String tbName) {
		JavaSharkContext sc =
				SharkEnv.initWithJavaSharkContext("GerSearchClicksSharkExample", "local");
		// Table setup statements, left disabled:
		//   sc.sql("drop table if exists search_clicks");
		//   sc.sql("CREATE TABLE src(key INT, value STRING)");
		//   sc.sql("LOAD DATA LOCAL INPATH
		//   '${env:HIVE_HOME}/examples/files/in1.txt'
		//   INTO TABLE src");
		JavaTableRDD tableRdd = sc.sql2rdd("SELECT count(*) FROM search.search_clicks");
		tableRdd.cache();
		System.out.println("Found " + tableRdd.count() + " num rows");

		// Converts a result row into a (key, value) pair.
		PairFunction<Row, Integer, String> toKeyValue = new PairFunction<Row, Integer, String>() {
			@Override
			public Tuple2<Integer, String> call(Row x) {
				return new Tuple2<Integer, String>(x.getInt("key"), x.getString("value"));
			}
		};
		JavaPairRDD<Integer, String> normalRDD = tableRdd.map(toKeyValue);
		System.out.println("Collected: " + normalRDD.collect());
	}
 
Example 9
/**
 * Queries the click actions that fall inside the given date range.
 *
 * @param sqlContext SQL context used to run the query
 * @param startDate start date (inclusive)
 * @param endDate end date (inclusive)
 * @return click actions keyed by the long value of the first selected column (city_id)
 */
private static JavaPairRDD<Long, Row> getcityid2ClickActionRDDByDate(
		SQLContext sqlContext, String startDate, String endDate) {
	// Select user visit actions from user_visit_action with two restrictions:
	//   1. click_product_id is not null, which marks the action as a click;
	//   2. the day lies inside the requested date range.
	// NOTE(review): the dates are concatenated straight into the SQL text -
	// confirm they never come from untrusted input.
	String sql =
			"SELECT "
				+ "city_id,"
				+ "click_product_id product_id "
			+ "FROM user_visit_action "
			+ "WHERE click_product_id IS NOT NULL "
			+ "AND day>='" + startDate + "' "
			+ "AND day<='" + endDate + "'";

	JavaRDD<Row> clickActionRDD = sqlContext.sql(sql).javaRDD();

	// Keys every click row by its city id (column 0 of the select list).
	PairFunction<Row, Long, Row> keyByCityId = new PairFunction<Row, Long, Row>() {

		private static final long serialVersionUID = 1L;

		@Override
		public Tuple2<Long, Row> call(Row clickRow) throws Exception {
			return new Tuple2<Long, Row>(clickRow.getLong(0), clickRow);
		}

	};

	return clickActionRDD.mapToPair(keyByCityId);
}
 
Example 10
/**
 * Converts the user visit actions into {@code <sessionid, action>} pairs.
 *
 * @param actionRDD RDD of user visit action rows
 * @return the same rows keyed by the string in column 2 (presumably the session id -
 *         confirm against the table schema)
 */
private static JavaPairRDD<String, Row> getSessionid2actionRDD(
		JavaRDD<Row> actionRDD) {
	return actionRDD.mapToPair(new PairFunction<Row, String, Row>() {

		// Must be a static final primitive long to be honoured by Java serialization;
		// a non-static Long field is just ordinary instance state.
		private static final long serialVersionUID = 1L;

		@Override
		public Tuple2<String, Row> call(Row row) throws Exception {
			String sessionid = row.getString(2);
			return new Tuple2<String, Row>(sessionid, row);
		}

	});
}
 
Example 11
Source Project: sparkResearch   Source File: JoinParirRDD.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Builds two pair RDDs from small in-memory string lists, keying every string by
 * its first space-separated token, and requests a descending sort of the first one.
 *
 * @param sparkContext the context used to create the RDDs
 */
public static void run(JavaSparkContext sparkContext){
    // Keys a string by the text before its first space (the whole string when there is none).
    PairFunction<String, String, String> keyByFirstToken = s -> new Tuple2<>(s.split(" ")[0], s);

    JavaRDD<String> rdd = sparkContext.parallelize(Arrays.asList("test", "java", "python"));
    JavaPairRDD<String, String> pairRDD = rdd.mapToPair(keyByFirstToken);

    JavaRDD<String> otherRDD = sparkContext.parallelize(Arrays.asList("golang", "php", "hadoop"));
    JavaPairRDD<String, String> pairRDDOther = otherRDD.mapToPair(keyByFirstToken);

    // NOTE(review): the sorted RDD is discarded and pairRDDOther is never used -
    // confirm whether a join or an action was intended here.
    pairRDD.sortByKey(false);
}
 
Example 12
Source Project: incubator-nemo   Source File: SparkFrontendUtils.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Adapts a {@link PairFunction} to a plain map {@link Function} that yields the
 * same {@link Tuple2} for every element.
 *
 * @param pairFunction the pair function to wrap.
 * @param <T>          the element type consumed by the function.
 * @param <K>          the key type of the produced pair.
 * @param <V>          the value type of the produced pair.
 * @return a map function that delegates every call to {@code pairFunction}.
 */
public static <T, K, V> Function<T, Tuple2<K, V>> pairFunctionToPlainFunction(
  final PairFunction<T, K, V> pairFunction) {
  // Pure delegation; the wrapped function is only touched when the result is invoked.
  return element -> pairFunction.call(element);
}
 
Example 13
/**
 * Streams text from a socket, counts the words of every batch, and adds the seed
 * count of each word that also appears in a small static RDD.
 * <p>
 * Both the plain word counts and the joined counts are printed for every batch.
 *
 * @param args command-line arguments (unused)
 * @throws Exception declared for starting and awaiting the streaming context
 */
public static void main(String[] args) throws Exception {
  System.setProperty("hadoop.home.dir", "E:\\hadoop");

  SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]");
  JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));
  LogManager.getRootLogger().setLevel(Level.WARN);

  // Static seed counts that every batch is joined against.
  List<Tuple2<String, Integer>> seedCounts =
      Arrays.asList(new Tuple2<>("hello", 10), new Tuple2<>("world", 10));
  JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(seedCounts);

  JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream(
      "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER);

  JavaDStream<String> words = StreamingLines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());

  JavaPairDStream<String, Integer> wordCounts = words
      .mapToPair(word -> new Tuple2<>(word, 1))
      .reduceByKey((left, right) -> left + right);

  wordCounts.print();

  // For every batch: inner-join with the seed counts and sum both counts per word.
  JavaPairDStream<String, Integer> joinedDstream = wordCounts.transformToPair(
      new Function<JavaPairRDD<String, Integer>, JavaPairRDD<String, Integer>>() {
        @Override
        public JavaPairRDD<String, Integer> call(JavaPairRDD<String, Integer> batch) throws Exception {
          return batch.join(initialRDD).mapToPair(
              (PairFunction<Tuple2<String, Tuple2<Integer, Integer>>, String, Integer>) joined ->
                  new Tuple2<>(joined._1(), joined._2()._1() + joined._2()._2()));
        }
      });

  joinedDstream.print();
  streamingContext.start();
  streamingContext.awaitTermination();
}
 
Example 14
/**
 * Streams text from a socket, counts the words of every batch, and adds the seed
 * count of each word that also appears in a small static RDD.
 * <p>
 * Both the plain word counts and the joined counts are printed for every batch.
 *
 * @param args command-line arguments (unused)
 * @throws Exception declared for starting and awaiting the streaming context
 */
public static void main(String[] args) throws Exception {

  System.setProperty("hadoop.home.dir", "E:\\hadoop");

  SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]");
  JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));

  // Static seed counts that every batch is joined against.
  List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 10), new Tuple2<>("world", 10));
  JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);

  JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER);

  JavaDStream<String> words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() );

  JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 );

  wordCounts.print();

  JavaPairDStream<String, Integer> joinedDstream = wordCounts.transformToPair(
      new Function<JavaPairRDD<String, Integer>, JavaPairRDD<String, Integer>>() {
        @Override
        public JavaPairRDD<String, Integer> call(JavaPairRDD<String, Integer> rdd) throws Exception {
          // RDD transformations return a new RDD, so the joined result itself must be
          // returned; returning rdd would silently drop the join.
          return rdd.join(initialRDD).mapToPair(
              new PairFunction<Tuple2<String, Tuple2<Integer, Integer>>, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(Tuple2<String, Tuple2<Integer, Integer>> joinedTuple)
                    throws Exception {
                  // batch count + seed count
                  return new Tuple2<>(joinedTuple._1(), (joinedTuple._2()._1() + joinedTuple._2()._2()));
                }
              });
        }
      });

  joinedDstream.print();
  streamingContext.start();
  streamingContext.awaitTermination();
}
 
Example 15
Source Project: SparkDemo   Source File: JavaLogQuery.java    License: MIT License 5 votes vote down vote up
/**
 * Aggregates statistics of Apache access-log lines per extracted key and prints
 * one tab-separated line for every key.
 * <p>
 * The log lines are read from the file named by the single argument, or taken
 * from {@code exampleApacheLogs} when the argument count is not exactly one.
 *
 * @param args optionally one element: the path of the log file
 */
public static void main(String[] args) {
  SparkSession spark = SparkSession.builder().appName("JavaLogQuery").getOrCreate();
  JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

  JavaRDD<String> dataSet;
  if (args.length == 1) {
    dataSet = jsc.textFile(args[0]);
  } else {
    dataSet = jsc.parallelize(exampleApacheLogs);
  }

  // Turns a log line into (key, stats).
  PairFunction<String, Tuple3<String, String, String>, Stats> toKeyedStats =
      new PairFunction<String, Tuple3<String, String, String>, Stats>() {
        @Override
        public Tuple2<Tuple3<String, String, String>, Stats> call(String line) {
          return new Tuple2<>(extractKey(line), extractStats(line));
        }
      };

  // Combines the stats of two lines that share a key.
  Function2<Stats, Stats, Stats> mergeStats = new Function2<Stats, Stats, Stats>() {
    @Override
    public Stats call(Stats left, Stats right) {
      return left.merge(right);
    }
  };

  JavaPairRDD<Tuple3<String, String, String>, Stats> extracted = dataSet.mapToPair(toKeyedStats);
  JavaPairRDD<Tuple3<String, String, String>, Stats> counts = extracted.reduceByKey(mergeStats);

  List<Tuple2<Tuple3<String, String, String>, Stats>> output = counts.collect();
  for (Tuple2<?,?> entry : output) {
    System.out.println(entry._1() + "\t" + entry._2());
  }
  spark.stop();
}
 
Example 16
Source Project: SparkDemo   Source File: JavaNaiveBayesExample.java    License: MIT License 5 votes vote down vote up
/**
 * Trains a naive Bayes model on a 60/40 random split of the sample LIBSVM data,
 * computes the accuracy on the held-out part, and saves and reloads the model.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
  SparkConf sparkConf = new SparkConf().setAppName("JavaNaiveBayesExample");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  // $example on$
  String path = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> inputData = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();
  JavaRDD<LabeledPoint>[] splits = inputData.randomSplit(new double[]{0.6, 0.4});
  JavaRDD<LabeledPoint> training = splits[0]; // training set
  JavaRDD<LabeledPoint> test = splits[1]; // test set
  final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);

  // Pairs the predicted label with the true label of a test point.
  PairFunction<LabeledPoint, Double, Double> toPredictionAndLabel =
    new PairFunction<LabeledPoint, Double, Double>() {
      @Override
      public Tuple2<Double, Double> call(LabeledPoint point) {
        return new Tuple2<>(model.predict(point.features()), point.label());
      }
    };
  JavaPairRDD<Double, Double> predictionAndLabel = test.mapToPair(toPredictionAndLabel);

  // Keeps the pairs whose prediction equals the label.
  Function<Tuple2<Double, Double>, Boolean> isCorrect = new Function<Tuple2<Double, Double>, Boolean>() {
    @Override
    public Boolean call(Tuple2<Double, Double> pair) {
      return pair._1().equals(pair._2());
    }
  };
  long correct = predictionAndLabel.filter(isCorrect).count();
  double accuracy = correct / (double) test.count();

  // Save and load model
  model.save(jsc.sc(), "target/tmp/myNaiveBayesModel");
  NaiveBayesModel sameModel = NaiveBayesModel.load(jsc.sc(), "target/tmp/myNaiveBayesModel");
  // $example off$

  jsc.stop();
}
 
Example 17
Source Project: SparkDemo   Source File: JavaTC.java    License: MIT License 5 votes vote down vote up
/**
 * Computes the transitive closure of a generated graph and prints its edge count.
 *
 * @param args optionally one element: the number of slices to parallelize the
 *             graph into (2 when absent)
 */
public static void main(String[] args) {
  SparkSession spark = SparkSession.builder().appName("JavaTC").getOrCreate();
  JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

  int slices = 2;
  if (args.length > 0) {
    slices = Integer.parseInt(args[0]);
  }
  JavaPairRDD<Integer, Integer> tc = jsc.parallelizePairs(generateGraph(), slices).cache();

  // Linear transitive closure: every round extends the known paths by one edge,
  // joining the graph's edges with the paths discovered so far. For example the
  // path (y, z) from the closure joined with the edge (x, y) yields the path (x, z).

  // join() matches on keys, so the edges are kept in reversed order.
  PairFunction<Tuple2<Integer, Integer>, Integer, Integer> reverseEdge =
    new PairFunction<Tuple2<Integer, Integer>, Integer, Integer>() {
      @Override
      public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> edge) {
        return new Tuple2<>(edge._2(), edge._1());
      }
    };
  JavaPairRDD<Integer, Integer> edges = tc.mapToPair(reverseEdge);

  long pathCount = tc.count();
  while (true) {
    long previousCount = pathCount;
    // The join produces (y, (z, x)) pairs, which are projected to the new (x, z) paths.
    tc = tc.union(tc.join(edges).mapToPair(ProjectFn.INSTANCE)).distinct().cache();
    pathCount = tc.count();
    if (pathCount == previousCount) {
      // A round that adds no path means the closure is complete.
      break;
    }
  }

  System.out.println("TC has " + tc.count() + " edges.");
  spark.stop();
}
 
Example 18
Source Project: SparkDemo   Source File: JavaCustomReceiver.java    License: MIT License 5 votes vote down vote up
/**
 * Counts the words received through a {@code JavaCustomReceiver} in one-second
 * batches and prints the counts of every batch.
 *
 * @param args host name and port of the text source, in that order
 * @throws Exception declared for starting and awaiting the streaming context
 */
public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: JavaCustomReceiver <hostname> <port>");
    System.exit(1);
  }

  StreamingExamples.setStreamingLogLevels();

  // The context uses a batch size of 1000 ms.
  SparkConf sparkConf = new SparkConf().setAppName("JavaCustomReceiver");
  JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, new Duration(1000));

  // Input stream fed by the custom receiver connected to the target ip:port;
  // the input is \n delimited text (eg. generated by 'nc').
  String host = args[0];
  int port = Integer.parseInt(args[1]);
  JavaReceiverInputDStream<String> lines = ssc.receiverStream(new JavaCustomReceiver(host, port));

  // Splits a line into words.
  FlatMapFunction<String, String> splitWords = new FlatMapFunction<String, String>() {
    @Override
    public Iterator<String> call(String line) {
      return Arrays.asList(SPACE.split(line)).iterator();
    }
  };
  // Pairs a word with a count of one.
  PairFunction<String, String, Integer> toWordOne = new PairFunction<String, String, Integer>() {
    @Override
    public Tuple2<String, Integer> call(String word) {
      return new Tuple2<>(word, 1);
    }
  };
  // Adds two partial counts.
  Function2<Integer, Integer, Integer> sum = new Function2<Integer, Integer, Integer>() {
    @Override
    public Integer call(Integer left, Integer right) {
      return left + right;
    }
  };

  JavaDStream<String> words = lines.flatMap(splitWords);
  JavaPairDStream<String, Integer> wordCounts = words.mapToPair(toWordOne).reduceByKey(sum);

  wordCounts.print();
  ssc.start();
  ssc.awaitTermination();
}
 
Example 19
Source Project: hudi   Source File: UpsertPartitioner.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Looks up the small files of every given partition path with a Spark job that
 * uses one Spark partition per path.
 *
 * @param partitionPaths the partition paths to inspect; may be null or empty
 * @param jsc the Spark context used to parallelize the lookup
 * @return partition path mapped to its small files; an empty map when no paths are given
 */
private Map<String, List<SmallFile>> getSmallFilesForPartitions(List<String> partitionPaths, JavaSparkContext jsc) {
  // Nothing to look up: hand back an empty map without touching Spark.
  if (partitionPaths == null || partitionPaths.isEmpty()) {
    return new HashMap<>();
  }

  JavaRDD<String> pathsRdd = jsc.parallelize(partitionPaths, partitionPaths.size());
  return pathsRdd
      .mapToPair((PairFunction<String, String, List<SmallFile>>)
          path -> new Tuple2<>(path, getSmallFiles(path)))
      .collectAsMap();
}
 
Example 20
Source Project: beam   Source File: TranslationUtils.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Builds a pair function that keys a {@link WindowedValue} {@link KV} by the
 * encoded bytes of its key, keeping the whole windowed value as the pair's value.
 *
 * @param keyCoder coder used to turn the key into bytes
 * @param <K> the key type
 * @param <V> the value type
 */
public static <K, V>
    PairFunction<WindowedValue<KV<K, V>>, ByteArray, WindowedValue<KV<K, V>>>
        toPairByKeyInWindowedValue(final Coder<K> keyCoder) {
  return windowedKv -> {
    final K key = windowedKv.getValue().getKey();
    final ByteArray encodedKey = new ByteArray(CoderHelpers.toByteArray(key, keyCoder));
    return new Tuple2<>(encodedKey, windowedKv);
  };
}
 
Example 21
Source Project: beam   Source File: TranslationUtils.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Adapts a {@link PairFunction} to a {@link PairFlatMapFunction} whose input is an
 * {@link Iterator}, so that a function written for mapToPair can be reused in
 * flatMapToPair.
 *
 * <p>Any exception thrown by the wrapped function is rethrown wrapped in a
 * {@link RuntimeException}.
 *
 * @param pairFunction the {@link PairFunction} to adapt.
 * @param <T> the input type.
 * @param <K> the output key type.
 * @param <V> the output value type.
 * @return a {@link PairFlatMapFunction} that accepts an {@link Iterator} as an input and applies
 *     the {@link PairFunction} on every element.
 */
public static <T, K, V> PairFlatMapFunction<Iterator<T>, K, V> pairFunctionToPairFlatMapFunction(
    final PairFunction<T, K, V> pairFunction) {
  return itr -> Iterators.transform(itr, element -> callPairFunctionUnchecked(pairFunction, element));
}

/** Applies {@code pairFunction} to {@code element}, wrapping any exception in a RuntimeException. */
private static <T, K, V> Tuple2<K, V> callPairFunctionUnchecked(
    final PairFunction<T, K, V> pairFunction, final T element) {
  try {
    return pairFunction.call(element);
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
 
Example 22
Source Project: beam   Source File: TranslationUtils.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Returns a pair function that wraps every windowed value together with the coder
 * registered for its tag, keeping the tag as the key.
 *
 * @param coderMap - mapping between TupleTag and a coder
 * @return a pair function to convert value to bytes via coder
 */
public static PairFunction<
        Tuple2<TupleTag<?>, WindowedValue<?>>,
        TupleTag<?>,
        ValueAndCoderLazySerializable<WindowedValue<?>>>
    getTupleTagEncodeFunction(final Map<TupleTag<?>, Coder<WindowedValue<?>>> coderMap) {
  return tagged -> {
    final TupleTag<?> tag = tagged._1;
    final WindowedValue<?> value = tagged._2;
    // The coder is looked up by the element's own tag.
    final ValueAndCoderLazySerializable<WindowedValue<?>> encodable =
        ValueAndCoderLazySerializable.of(value, coderMap.get(tag));
    return new Tuple2<>(tag, encodable);
  };
}
 
Example 23
Source Project: beam   Source File: TranslationUtils.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Returns a pair function that unwraps every lazily serialized value with the
 * coder registered for its tag, keeping the tag as the key.
 *
 * @param coderMap - mapping between TupleTag and a coder
 * @return a pair function to convert bytes to value via coder
 */
public static PairFunction<
        Tuple2<TupleTag<?>, ValueAndCoderLazySerializable<WindowedValue<?>>>,
        TupleTag<?>,
        WindowedValue<?>>
    getTupleTagDecodeFunction(final Map<TupleTag<?>, Coder<WindowedValue<?>>> coderMap) {
  return tagged -> {
    final TupleTag<?> tag = tagged._1;
    final ValueAndCoderLazySerializable<WindowedValue<?>> encoded = tagged._2;
    // The coder is looked up by the element's own tag.
    final WindowedValue<?> decoded = encoded.getOrDecode(coderMap.get(tag));
    return new Tuple2<>(tag, decoded);
  };
}
 
Example 24
Source Project: beam   Source File: CoderHelpers.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Returns a function that decodes a (key bytes, iterable of value bytes) pair into
 * a key-value pair whose value is an {@link Iterable} of decoded values.
 *
 * <p>The values are decoded eagerly into a list when the function is applied.
 *
 * @param keyCoder Coder to deserialize keys.
 * @param valueCoder Coder to deserialize values.
 * @param <K> The type of the key being deserialized.
 * @param <V> The type of the value being deserialized.
 * @return A function that accepts a pair of byte arrays and returns a key-value pair.
 */
public static <K, V>
    PairFunction<Tuple2<ByteArray, Iterable<byte[]>>, K, Iterable<V>> fromByteFunctionIterable(
        final Coder<K> keyCoder, final Coder<V> valueCoder) {
  return tuple -> {
    // Key first, then values - the same order in which the pair is assembled.
    final K key = fromByteArray(tuple._1().getValue(), keyCoder);
    final Iterable<V> values =
        StreamSupport.stream(tuple._2().spliterator(), false)
            .map(bytes -> fromByteArray(bytes, valueCoder))
            .collect(Collectors.toList());
    return new Tuple2<>(key, values);
  };
}
 
Example 25
Source Project: datacollector   Source File: HalfHalfTransformer.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Collects all records and splits them at the midpoint: the first half becomes the
 * result, the second half becomes errors that all carry {@code ERROR_STRING}.
 * With an odd record count the extra record lands in the error half.
 *
 * @param javaRDD the incoming records
 * @return the result records together with the (record, error message) pairs
 */
@Override
public TransformResult transform(JavaRDD<Record> javaRDD) {
  List<Record> allRecords = javaRDD.collect();
  int midpoint = allRecords.size() / 2;
  List<Record> passing = allRecords.subList(0, midpoint);
  List<Record> failing = allRecords.subList(midpoint, allRecords.size());

  JavaRDD<Record> result = jsc.parallelize(passing);

  // Attaches the fixed error message to a record.
  PairFunction<Record, Record, String> withErrorMessage = new PairFunction<Record, Record, String>() {
    @Override
    public Tuple2<Record, String> call(Record failed) throws Exception {
      return new Tuple2<>(failed, ERROR_STRING);
    }
  };
  JavaPairRDD<Record, String> errors = jsc.parallelize(failing).mapToPair(withErrorMessage);

  return new TransformResult(result, errors);
}
 
Example 26
Source Project: datacollector   Source File: OnlyErrorTransformer.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Turns every incoming record into an error whose message is the string value of
 * the record's {@code ERROR_PATH} field. The result RDD passed on is {@code null}.
 *
 * @param javaRDD the incoming records
 * @return a transform result holding a null result RDD and the error pairs
 */
@Override
public TransformResult transform(JavaRDD<Record> javaRDD) {
  // This transformer produces no regular output.
  JavaRDD<Record> result = null;

  // Pairs a record with the error message stored inside it.
  PairFunction<Record, Record, String> toError = new PairFunction<Record, Record, String>() {
    @Override
    public Tuple2<Record, String> call(Record failed) throws Exception {
      String message = failed.get(ERROR_PATH).getValueAsString();
      return new Tuple2<>(failed, message);
    }
  };
  JavaPairRDD<Record, String> errors = javaRDD.mapToPair(toError);

  return new TransformResult(result, errors);
}
 
Example 27
Source Project: deep-spark   Source File: ExtractorEntityTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Loads the team entities for the join tests and keys each one by its own id.
 *
 * @param context the deep context
 * @return (team id, team) pairs
 */
protected JavaPairRDD<Long, TeamEntity> prepareTeamRDD(DeepSparkContext context) {
    ExtractorConfig<TeamEntity> teamsConfigEntity = getReadExtractorConfig(
            this.databaseExtractorName, FOOTBALL_TEAM_INPUT, TeamEntity.class);

    // Keys a team by its id.
    PairFunction<TeamEntity, Long, TeamEntity> keyById = new PairFunction<TeamEntity, Long, TeamEntity>() {
        @Override
        public Tuple2<Long, TeamEntity> call(TeamEntity team) throws Exception {
            return new Tuple2<>(team.getId(), team);
        }
    };

    return context.createJavaRDD(teamsConfigEntity).mapToPair(keyById);
}
 
Example 28
Source Project: deep-spark   Source File: ExtractorEntityTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Loads the player entities for the join tests and keys each one by the id of its team.
 *
 * @param context the deep context
 * @return (team id, player) pairs
 */
protected JavaPairRDD<Long, PlayerEntity> preparePlayerRDD(DeepSparkContext context) {
    ExtractorConfig<PlayerEntity> playersConfigEntity = getReadExtractorConfig(
            this.databaseExtractorName, FOOTBALL_PLAYER_INPUT, PlayerEntity.class);

    // Keys a player by the id of the team it belongs to.
    PairFunction<PlayerEntity, Long, PlayerEntity> keyByTeamId =
            new PairFunction<PlayerEntity, Long, PlayerEntity>() {
                @Override
                public Tuple2<Long, PlayerEntity> call(PlayerEntity player) throws Exception {
                    return new Tuple2<>(player.getTeamId(), player);
                }
            };

    return context.createJavaRDD(playersConfigEntity).mapToPair(keyByTeamId);
}
 
Example 29
Source Project: examples   Source File: CountLines.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Counts the distinct users in a pipe-delimited text file, where the user is the
 * text before the first {@code |} of every line.
 * <p>
 * A line without a {@code |} is used as the user in full instead of failing the
 * job. The Spark context is closed even when the job throws.
 *
 * @param args command-line arguments (unused)
 */
@SuppressWarnings("serial")
public static void main(String[] args) {
  SparkConf sparkConf = new SparkConf().setAppName("JavaHBaseBulkGetExample ").setMaster("local[2]");
  try (JavaSparkContext jsc = new JavaSparkContext(sparkConf)) {
    JavaRDD<String> textFile = jsc.textFile("hdfs://localhost/user/cloudera/data.txt");
    JavaPairRDD<String, Integer> pairs = textFile.mapToPair(new PairFunction<String, String, Integer>() {
      public Tuple2<String, Integer> call(String s) {
        // indexOf returns -1 when the delimiter is missing; substring(0, -1) would throw.
        int delimiter = s.indexOf("|");
        String user = (delimiter >= 0) ? s.substring(0, delimiter) : s;
        return new Tuple2<String, Integer>(user, 1);
      }
    });
    JavaPairRDD<String, Integer> counts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
      public Integer call(Integer a, Integer b) { return a + b; }
    });
    System.out.println ("We have generaged " + counts.count() + " users");
  }
}
 
Example 30
Source Project: deeplearning4j   Source File: SparkUtils.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Keys every element of the RDD by its position as reported by {@code zipWithIndex()}.
 * <p>
 * The long index is narrowed with {@code intValue()}, so indices beyond the int
 * range do not survive the conversion unchanged.
 *
 * @param rdd the elements to index
 * @param <T> the element type
 * @return (index, element) pairs
 */
public static <T> JavaPairRDD<Integer, T> indexedRDD(JavaRDD<T> rdd) {
    // Swaps (element, index) into (index, element), narrowing the index to int.
    PairFunction<Tuple2<T, Long>, Integer, T> indexAsKey = new PairFunction<Tuple2<T, Long>, Integer, T>() {
        @Override
        public Tuple2<Integer, T> call(Tuple2<T, Long> elemIdx) {
            int index = elemIdx._2().intValue();
            T element = elemIdx._1();
            return new Tuple2<>(index, element);
        }
    };
    return rdd.zipWithIndex().mapToPair(indexAsKey);
}