Java Code Examples for org.apache.spark.api.java.JavaPairRDD#join()

The following examples show how to use org.apache.spark.api.java.JavaPairRDD#join(). Each example notes the project and source file it was taken from.
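Before the project examples, here is a minimal, self-contained sketch of the call itself (the class name JoinDemo and the sample data are ours, not from any of the projects below): join() performs an inner join, pairing the values of every key that appears in both RDDs.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class JoinDemo {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("join-demo");
        // JavaSparkContext is Closeable, so try-with-resources shuts it down cleanly
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaPairRDD<String, Integer> left = sc.parallelizePairs(Arrays.asList(
                    new Tuple2<>("a", 1), new Tuple2<>("b", 2)));
            JavaPairRDD<String, String> right = sc.parallelizePairs(Arrays.asList(
                    new Tuple2<>("a", "x"), new Tuple2<>("c", "y")));

            // Inner join: only keys present in both RDDs survive ("b" and "c" are dropped)
            JavaPairRDD<String, Tuple2<Integer, String>> joined = left.join(right);

            joined.collect().forEach(System.out::println); // prints (a,(1,x))
        }
    }
}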
Example 1
Source File: TransformationRDD.java    From hui-bigdata-spark with Apache License 2.0
/**
 * Join two datasets, combining the values that share a key.
 * Demo goal: for students who won awards both this year and last year, find the subjects they won in.
 *
 * @since hui_project 1.0.0
 */
public void testJoin() {
    SparkConf sparkConf = new SparkConf().setMaster("local[4]").setAppName("test");
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
    // subjects in which students won awards this year
    JavaPairRDD<String, String> rdd1 = sparkContext.parallelize(Arrays.asList(
            new Tuple2<>("xiaoming", "Chinese")
            , new Tuple2<>("xiaoming", "Math")
            , new Tuple2<>("lihua", "Math")
            , new Tuple2<>("xiaofeng", "Art")
            , new Tuple2<>("test", "Art")))
            .mapToPair(x -> new Tuple2<>(x._1, x._2));
    // subjects in which students won awards last year
    JavaPairRDD<String, String> rdd2 = sparkContext.parallelize(Arrays.asList(
            new Tuple2<>("xiaoming", "Art")
            , new Tuple2<>("lihua", "Art")
            , new Tuple2<>("xiaofeng", "Chinese")))
            .mapToPair(x -> new Tuple2<>(x._1, x._2));
    JavaPairRDD<String, Tuple2<String, String>> join = rdd1.join(rdd2);
    checkResult(join.collect());
}
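With the sample data above, only keys present in both RDDs survive the inner join: "test" is dropped, and "xiaoming" (two subjects this year, one last year) yields two output pairs, so collect() returns four (student, (thisYear, lastYear)) tuples.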
 
Example 2
Source File: TransformationRDDTest.java    From hui-bigdata-spark with Apache License 2.0
/**
 * Join two datasets, combining the values that share a key.
 * Demo goal: for students who won awards both this year and last year, find the subjects they won in.
 * @since hui_project 1.0.0
 */
@Test
public void testJoin() {
    // subjects in which students won awards this year
    JavaPairRDD<String, String> rdd1 = sparkContext.parallelize(Arrays.asList(
            new Tuple2<>("xiaoming", "Chinese")
            , new Tuple2<>("xiaoming", "Math")
            , new Tuple2<>("lihua", "Math")
            , new Tuple2<>("xiaofeng", "Art")
            , new Tuple2<>("test", "Art")))
            .mapToPair(x -> new Tuple2<>(x._1, x._2));
    // subjects in which students won awards last year
    JavaPairRDD<String, String> rdd2 = sparkContext.parallelize(Arrays.asList(
            new Tuple2<>("xiaoming", "Art")
            , new Tuple2<>("lihua", "Art")
            , new Tuple2<>("xiaofeng", "Chinese")))
            .mapToPair(x -> new Tuple2<>(x._1, x._2));
    JavaPairRDD<String, Tuple2<String, String>> join = rdd1.join(rdd2);
    checkResult(join.collect());
}
 
Example 3
Source File: SparkTableChecker.java    From spliceengine with GNU Affero General Public License v3.0
/**
 * Check for duplicate index entries.
 * @param table the base table data set
 * @param index the index data set to check for duplicates
 * @return report lines describing the duplicate index entries found
 * @throws StandardException
 */
private List<String> checkDuplicateIndexes(PairDataSet table, PairDataSet index) throws StandardException {
    try {
        SpliceSpark.pushScope(String.format("Check duplicates in index %s.%s", schemaName, indexName));
        JavaPairRDD duplicateIndexRdd = ((SparkPairDataSet) index).rdd
                .combineByKey(new CreateCombiner(), new MergeValue(), new MergeCombiners())
                .filter(new DuplicateIndexFilter());

        JavaPairRDD joinedRdd = duplicateIndexRdd
                .join(((SparkPairDataSet) table).rdd);

        JavaRDD duplicateIndex = joinedRdd
                .mapPartitions(new SparkFlatMapFunction<>(new DeleteDuplicateIndexFunction<>(conglomerate, txn, tentativeIndex, baseColumnMap, fix)));

        Iterator it = duplicateIndex.toLocalIterator();
        long count = duplicateIndex.count();
        return reportDuplicateIndexes(it, count, fix);
    } catch (Exception e) {
        throw StandardException.plainWrapException(e);
    }
    finally {
        SpliceSpark.popScope();
    }
}
 
Example 4
Source File: CumulativeOffsetSPInstruction.java    From systemds with Apache License 2.0
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());
	DataCharacteristics mc2 = sec.getDataCharacteristics(input2.getName());
	long rlen = mc2.getRows();
	int blen = mc2.getBlocksize();
	
	//get and join inputs
	JavaPairRDD<MatrixIndexes,MatrixBlock> inData = sec.getBinaryMatrixBlockRDDHandleForVariable(input1.getName());
	JavaPairRDD<MatrixIndexes,Tuple2<MatrixBlock,MatrixBlock>> joined = null;
	boolean broadcast = _broadcast && !SparkUtils.isHashPartitioned(inData);
	
	if( broadcast ) {
		//broadcast offsets and broadcast join with data
		PartitionedBroadcast<MatrixBlock> inAgg = sec.getBroadcastForVariable(input2.getName());
		joined = inData.mapToPair(new RDDCumSplitLookupFunction(inAgg, _initValue, rlen, blen));
	}
	else {
		//prepare aggregates (cumsplit of offsets) and repartition join with data
		joined = inData.join(sec
			.getBinaryMatrixBlockRDDHandleForVariable(input2.getName())
			.flatMapToPair(new RDDCumSplitFunction(_initValue, rlen, blen)));
	}
	
	//execute cumulative offset (apply cumulative op w/ offsets)
	JavaPairRDD<MatrixIndexes,MatrixBlock> out = joined
		.mapValues(new RDDCumOffsetFunction(_uop, _cumsumprod));
	
	//put output handle in symbol table
	if( _cumsumprod )
		sec.getDataCharacteristics(output.getName())
			.set(mc1.getRows(), 1, mc1.getBlocksize(), mc1.getBlocksize());
	else //general case
		updateUnaryOutputDataCharacteristics(sec);
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), input1.getName());
	sec.addLineage(output.getName(), input2.getName(), broadcast);
}
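Two join strategies appear here: when broadcasting is enabled and the data is not already hash-partitioned (in which case the shuffle would be cheap anyway), the offsets are broadcast and joined map-side via mapToPair, avoiding a shuffle of inData; otherwise the offsets are flat-mapped into per-block splits and joined with a regular repartition join.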
 
Example 5
Source File: AreaTop3ProductSpark.java    From BigDataPlatform with GNU General Public License v3.0
/**
 * Generate the temporary table of basic click-product information.
 * @param sqlContext SQL context used to create and register the DataFrame
 * @param cityid2clickActionRDD click action rows keyed by city id
 * @param cityid2cityInfoRDD city info rows keyed by city id
 */
private static void generateTempClickProductBasicTable(
		SQLContext sqlContext,
		JavaPairRDD<Long, Row> cityid2clickActionRDD,
		JavaPairRDD<Long, Row> cityid2cityInfoRDD) {
	// Perform the join, associating the click action data with the city data
	JavaPairRDD<Long, Tuple2<Row, Row>> joinedRDD =
			cityid2clickActionRDD.join(cityid2cityInfoRDD);
	
	// Convert the JavaPairRDD above into a JavaRDD<Row> (required before the RDD can become a Dataset<Row>)
	JavaRDD<Row> mappedRDD = joinedRDD.map(
			
			new Function<Tuple2<Long,Tuple2<Row,Row>>, Row>() {

				private static final long serialVersionUID = 1L;

				@Override
				public Row call(Tuple2<Long, Tuple2<Row, Row>> tuple)
						throws Exception {
					Long cityid = tuple._1;
					Row clickAction = tuple._2._1;
					Row cityInfo = tuple._2._2;
					
					Long productid = clickAction.getLong(1);
					String cityName = cityInfo.getString(1);
					String area = cityInfo.getString(2);
					
					return RowFactory.create(cityid, cityName, area, productid);  
				}
				
			});
	
	// With the data as a JavaRDD<Row>, it can now be converted to a Dataset<Row>
	List<StructField> structFields = new ArrayList<StructField>();
	structFields.add(DataTypes.createStructField("city_id", DataTypes.LongType, true));
	structFields.add(DataTypes.createStructField("city_name", DataTypes.StringType, true));
	structFields.add(DataTypes.createStructField("area", DataTypes.StringType, true));
	structFields.add(DataTypes.createStructField("product_id", DataTypes.LongType, true));
	
	// 1 Beijing
	// 2 Shanghai
	// 1 Beijing
	// group by area,product_id
	// 1:Beijing,2:Shanghai
	
	// Two functions:
	// UDF: concat2(), concatenates two fields with a specified separator
	// UDAF: group_concat_distinct(), concatenates a group's values with commas, de-duplicating them
	
	StructType schema = DataTypes.createStructType(structFields);

	Dataset<Row> df = sqlContext.createDataFrame(mappedRDD, schema);
	System.out.println("tmp_click_product_basic: " + df.count());  
	
	// Register the data in the Dataset<Row> as a temporary table (tmp_click_product_basic)
	df.registerTempTable("tmp_click_product_basic");  
}
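Note that registerTempTable() is the old Spark 1.x API; in Spark 2.x it is deprecated in favor of createOrReplaceTempView().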
 
Example 6
Source File: ExtractorEntityTest.java    From deep-spark with Apache License 2.0
/**
 * Tests whether the extractor can join two data sets.
 */
@Test
protected void testInnerJoin() {
    DeepSparkContext context = getDeepSparkContext();

    try {

        JavaPairRDD<Long, TeamEntity> teamsRDD = prepareTeamRDD(context);

        JavaPairRDD<Long, Iterable<PlayerEntity>> playersRDD = preparePlayerRDD(context).groupByKey();

        JavaPairRDD<Long, Tuple2<TeamEntity, Iterable<PlayerEntity>>> joinRDD = teamsRDD.join(playersRDD);

        assertEquals(joinRDD.count(), 4);

    } finally {
        context.stop();
    }
}
 
Example 7
Source File: SkewedJoinConverter.java    From spork with Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors,
                          POSkewedJoin poSkewedJoin) throws IOException {

    SparkUtil.assertPredecessorSize(predecessors, poSkewedJoin, 2);
    LRs = new POLocalRearrange[2];
    this.poSkewedJoin = poSkewedJoin;

    createJoinPlans(poSkewedJoin.getJoinPlans());

    // extract the two RDDs
    RDD<Tuple> rdd1 = predecessors.get(0);
    RDD<Tuple> rdd2 = predecessors.get(1);

    // make (key, value) pairs, key has type Object, value has type Tuple
    RDD<Tuple2<Object, Tuple>> rdd1Pair = rdd1.map(new ExtractKeyFunction(
            this, 0), SparkUtil.<Object, Tuple>getTuple2Manifest());
    RDD<Tuple2<Object, Tuple>> rdd2Pair = rdd2.map(new ExtractKeyFunction(
            this, 1), SparkUtil.<Object, Tuple>getTuple2Manifest());

    // join fn is present in JavaPairRDD class ..
    JavaPairRDD<Object, Tuple> rdd1Pair_javaRDD = new JavaPairRDD<Object, Tuple>(
            rdd1Pair, SparkUtil.getManifest(Object.class),
            SparkUtil.getManifest(Tuple.class));
    JavaPairRDD<Object, Tuple> rdd2Pair_javaRDD = new JavaPairRDD<Object, Tuple>(
            rdd2Pair, SparkUtil.getManifest(Object.class),
            SparkUtil.getManifest(Tuple.class));

    // do the join
    JavaPairRDD<Object, Tuple2<Tuple, Tuple>> result_KeyValue = rdd1Pair_javaRDD
            .join(rdd2Pair_javaRDD);

    // use mapPartitions over the (Object, Tuple2<Tuple, Tuple>) pairs to get a
    // JavaRDD<Tuple>, ignoring the key (of type Object) and appending the
    // values (the Tuples)
    JavaRDD<Tuple> result = result_KeyValue
            .mapPartitions(new ToValueFunction());

    // return type is RDD<Tuple>, so take it from JavaRDD<Tuple>
    return result.rdd();
}
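As the comments note, the skewed-join plumbing here (key extraction via POLocalRearrange, manifest wrangling) exists mainly to get the two Scala RDDs into JavaPairRDD form so that the standard join() can do the actual work.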
 
Example 8
Source File: ParamservUtils.java    From systemds with Apache License 2.0
/**
 * Assemble the matrix of features and labels according to the rowID
 *
 * @param featuresRDD indexed features matrix block
 * @param labelsRDD indexed labels matrix block
 * @return assembled RDD with the rowID as key and the pair of features and labels matrices as value (rowID {@literal ->} features, labels)
 */
public static JavaPairRDD<Long, Tuple2<MatrixBlock, MatrixBlock>> assembleTrainingData(JavaPairRDD<MatrixIndexes, MatrixBlock> featuresRDD, JavaPairRDD<MatrixIndexes, MatrixBlock> labelsRDD) {
	JavaPairRDD<Long, MatrixBlock> fRDD = groupMatrix(featuresRDD);
	JavaPairRDD<Long, MatrixBlock> lRDD = groupMatrix(labelsRDD);
	//TODO Add an additional physical operator which broadcasts the labels directly (broadcast join with features) if certain memory budgets are satisfied
	return fRDD.join(lRDD);
}
 