Java Code Examples for org.apache.spark.api.java.JavaRDD#union()

The following examples show how to use org.apache.spark.api.java.JavaRDD#union(). Each example is taken from an open-source project; the source file and license are noted above the code.
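As a starting point, here is a minimal, self-contained sketch of the call itself (the class and variable names are illustrative, not taken from any of the projects below): union() concatenates two RDDs of the same element type and does not remove duplicates.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class UnionSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("union-sketch");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<String> first = sc.parallelize(Arrays.asList("a", "b", "c"));
            JavaRDD<String> second = sc.parallelize(Arrays.asList("c", "d"));

            // union() simply concatenates the two RDDs; "c" appears twice in the result.
            JavaRDD<String> both = first.union(second);
            both.collect().forEach(System.out::println); // a, b, c, c, d
        }
    }
}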
Example 1
Source File: RemoveOrphanFilesAction.java    From iceberg with Apache License 2.0
private Dataset<Row> buildActualFileDF() {
  List<String> subDirs = Lists.newArrayList();
  List<String> matchingFiles = Lists.newArrayList();

  Predicate<FileStatus> predicate = file -> file.getModificationTime() < olderThanTimestamp;

  // list at most 3 levels and only dirs that have less than 10 direct sub dirs on the driver
  listDirRecursively(location, predicate, hadoopConf.value(), 3, 10, subDirs, matchingFiles);

  JavaRDD<String> matchingFileRDD = sparkContext.parallelize(matchingFiles, 1);

  if (subDirs.isEmpty()) {
    return spark.createDataset(matchingFileRDD.rdd(), Encoders.STRING()).toDF("file_path");
  }

  int parallelism = Math.min(subDirs.size(), partitionDiscoveryParallelism);
  JavaRDD<String> subDirRDD = sparkContext.parallelize(subDirs, parallelism);

  Broadcast<SerializableConfiguration> conf = sparkContext.broadcast(hadoopConf);
  JavaRDD<String> matchingLeafFileRDD = subDirRDD.mapPartitions(listDirsRecursively(conf, olderThanTimestamp));

  JavaRDD<String> completeMatchingFileRDD = matchingFileRDD.union(matchingLeafFileRDD);
  return spark.createDataset(completeMatchingFileRDD.rdd(), Encoders.STRING()).toDF("file_path");
}
 
Example 2
Source File: Union.java    From SparkDemo with MIT License
static void union(JavaSparkContext sc) {
    List<String> datas1 = Arrays.asList("张三", "李四");
    List<String> datas2 = Arrays.asList("tom", "gim");

    JavaRDD<String> data1RDD = sc.parallelize(datas1);
    JavaRDD<String> data2RDD = sc.parallelize(datas2);

    // Merge the two RDDs without removing duplicates; both RDDs must contain elements of the same type.
    JavaRDD<String> unionRDD = data1RDD.union(data2RDD);

    unionRDD.foreach(new VoidFunction<String>() {
        @Override
        public void call(String t) throws Exception {
            System.out.println(t);
        }
    });

    sc.close();
}
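Because union() keeps duplicates, a distinct() is often chained after it when set semantics are wanted. A minimal follow-up sketch (assuming a JavaSparkContext named sc, as in the example above; the data and method name are illustrative):

static void unionDistinct(JavaSparkContext sc) {
    JavaRDD<String> left = sc.parallelize(Arrays.asList("a", "b"));
    JavaRDD<String> right = sc.parallelize(Arrays.asList("b", "c"));

    // union() keeps both copies of "b"; distinct() then removes the duplicate.
    JavaRDD<String> merged = left.union(right).distinct();
    merged.collect().forEach(System.out::println); // a, b, c (order not guaranteed)
}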
 
Example 3
Source File: SparkOperatorProfiler.java    From rheem with Apache License 2.0
/**
 * Helper method to generate data quanta and provide them as a cached {@link JavaRDD}.
 */
protected <T> JavaRDD<T> prepareInputRddInDriver(long cardinality, int inputIndex) {
    @SuppressWarnings("unchecked")
    final Supplier<T> supplier = (Supplier<T>) this.dataQuantumGenerators.get(inputIndex);
    JavaRDD<T> finalInputRdd = null;

    // Create batches, parallelize them, and union them.
    long remainder = cardinality;
    do {
        int batchSize = (int) Math.min(remainder, this.dataQuantumGeneratorBatchSize);
        List<T> batch = new ArrayList<>(batchSize);
        while (batch.size() < batchSize) {
            batch.add(supplier.get());
        }
        final JavaRDD<T> batchRdd = this.sparkExecutor.sc.parallelize(batch);
        finalInputRdd = finalInputRdd == null ? batchRdd : finalInputRdd.union(batchRdd);
        remainder -= batchSize;
    } while (remainder > 0);

    // Shuffle and cache the RDD; the no-op foreach is an action that forces evaluation,
    // so the cached data is materialized before profiling starts.
    final JavaRDD<T> cachedInputRdd = this.partition(finalInputRdd).cache();
    cachedInputRdd.foreach(dataQuantum -> {
    });

    return cachedInputRdd;
}
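Examples 3, 4, and 10 build their result by calling union() repeatedly in a loop, which nests one two-way union inside another. When all of the pieces are available up front, Spark 2.x and later also expose a varargs JavaSparkContext.union(...) that merges them in a single call; the helper below is a sketch under that version assumption (the method and parameter names are illustrative):

static <T> JavaRDD<T> unionAll(JavaSparkContext sc, List<JavaRDD<T>> parts) {
    // One n-way union instead of a chain of pairwise union() calls.
    @SuppressWarnings("unchecked")
    JavaRDD<T>[] asArray = parts.toArray(new JavaRDD[0]);
    return sc.union(asArray);
}

In Example 3, for instance, the batch RDDs could be collected into a list inside the loop and merged once at the end rather than unioned pairwise on every iteration.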
 
Example 4
Source File: GATKSparkTool.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Loads the reads into a {@link JavaRDD} using the intervals specified, and returns them
 * without applying any filtering.
 *
 * If no intervals were specified, returns all the reads (both mapped and unmapped).
 *
 * @return all reads from our reads input(s) as a {@link JavaRDD}, bounded by intervals if specified, and unfiltered.
 */
public JavaRDD<GATKRead> getUnfilteredReads() {
    final TraversalParameters traversalParameters;
    if ( hasUserSuppliedIntervals() ) { // intervals may have been supplied by editIntervals
        final boolean traverseUnmapped;
        if (intervalArgumentCollection.intervalsSpecified()) {
            traverseUnmapped = intervalArgumentCollection.getTraversalParameters(getHeaderForReads().getSequenceDictionary()).traverseUnmappedReads();
        } else {
            traverseUnmapped = false;
        }
        traversalParameters = new TraversalParameters(getIntervals(), traverseUnmapped);
    } else {
        traversalParameters = null;
    }

    JavaRDD<GATKRead> output = null;
    ReadsSparkSource source = readsSource;
    for (final GATKPath inputPathSpecifier : readInputs.keySet()) {
        if (output == null) {
            output = getGatkReadJavaRDD(traversalParameters, source, inputPathSpecifier);
        } else {
            output = output.union(getGatkReadJavaRDD(traversalParameters, source, inputPathSpecifier));
        }
    }
    return output;
}
 
Example 5
Source File: PSScorer.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Moves reads from the same read template into an Iterable.
 * Paired reads must be queryname-sorted, and no pair of reads can be split across partitions.
 */
static JavaRDD<Iterable<GATKRead>> groupReadsIntoPairs(final JavaRDD<GATKRead> pairedReads,
                                                       final JavaRDD<GATKRead> unpairedReads,
                                                       final int readsPerPartitionGuess) {
    JavaRDD<Iterable<GATKRead>> groupedReads;
    if (pairedReads != null) {
        groupedReads = pairedReads.mapPartitions(iter -> groupPairedReadsPartition(iter, readsPerPartitionGuess));
        if (unpairedReads != null) {
            groupedReads = groupedReads.union(unpairedReads.map(Collections::singletonList));
        }
    } else if (unpairedReads != null) {
        groupedReads = unpairedReads.map(Collections::singletonList);
    } else {
        throw new UserException.BadInput("No reads were loaded. Ensure --paired-input and/or --unpaired-input are set and valid.");
    }
    return groupedReads;
}
 
Example 6
Source File: MLUpdate.java    From oryx with Apache License 2.0
private Pair<JavaRDD<M>,JavaRDD<M>> splitTrainTest(JavaRDD<M> newData, JavaRDD<M> pastData) {
  Objects.requireNonNull(newData);
  if (testFraction <= 0.0) {
    return new Pair<>(pastData == null ? newData : newData.union(pastData), null);
  }
  if (testFraction >= 1.0) {
    return new Pair<>(pastData, newData);
  }
  if (empty(newData)) {
    return new Pair<>(pastData, null);
  }
  Pair<JavaRDD<M>,JavaRDD<M>> newTrainTest = splitNewDataToTrainTest(newData);
  JavaRDD<M> newTrainData = newTrainTest.getFirst();
  return new Pair<>(pastData == null ? newTrainData : newTrainData.union(pastData),
                    newTrainTest.getSecond());
}
 
Example 7
Source File: TransformationRDD.java    From hui-bigdata-spark with Apache License 2.0
/**
 * Union of two RDDs.
 * Purpose of the demo: find all records whose entry station is Guangzhou South or Tianhe Coach Terminal.
 *
 * @since hui_project 1.0.0
 */
public void testUnionAndFilter() {
    SparkConf sparkConf = new SparkConf().setMaster("local[4]").setAppName("test");
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
    JavaRDD<String> textRDD = sparkContext.textFile(FILE_PATH);
    JavaRDD<String> result = textRDD.filter(x -> x.contains("广州南站"));
    JavaRDD<String> result1 = textRDD.filter(x -> x.contains("天河客运站"));
    JavaRDD<String> union = result.union(result1);
    System.out.println("-------" + union.count() + "-------");
    checkResult(union.collect());
}
 
Example 8
Source File: TransformationRDDTest.java    From hui-bigdata-spark with Apache License 2.0
/**
 * Union of two RDDs.
 * Purpose of the demo: find all records whose entry station is Guangzhou South or Tianhe Coach Terminal.
 * @since hui_project 1.0.0
 */
@Test
public void testUnionAndFilter() {
    JavaRDD<String> textRDD = sparkContext.textFile(FILE_PATH);
    JavaRDD<String> result = textRDD.filter(x -> x.contains("广州南站"));
    JavaRDD<String> result1 = textRDD.filter(x -> x.contains("天河客运站"));
    JavaRDD<String> union = result.union(result1);
    System.out.println("-------" + union.count() + "-------");
    checkResult(union.collect());
}
 
Example 9
Source File: QuadUtils.java    From rdf2x with Apache License 2.0
/**
 * Get resources related to specified resources, computed by querying an in-memory set of subjects
 *
 * @param quads       RDD of quads to filter
 * @param subjectURIs set of requested subject URIs to grow from
 * @param directed    if true, only follow relations from the given subjects to their objects; if false, follow relations in both directions
 * @return URIs of resources related to specified resources
 */
public static JavaRDD<String> getNeighborResources(JavaRDD<Quad> quads, Set<String> subjectURIs, boolean directed) {
    JavaRDD<String> neighbors = filterQuadsByAllowedSubjects(quads, subjectURIs)
            .filter(quad -> quad.getObject().isURI())
            .map(quad -> quad.getObject().getURI());
    if (!directed) {
        neighbors = neighbors.union(filterQuadsByObjects(quads, subjectURIs)
                .filter(quad -> quad.getSubject().isURI())
                .map(quad -> quad.getSubject().getURI()));
    }
    return neighbors;
}
 
Example 10
Source File: SparkFileInputStream.java    From incubator-retired-mrql with Apache License 2.0
@Override
public Option<RDD<MRData>> compute ( Time validTime ) {
    JavaRDD<MRData> rdd = null;
    for ( String file: new_files() )
        if (rdd == null)
            rdd = hadoopFile(file);
        else rdd = rdd.union(hadoopFile(file));
    if (rdd == null)
        rdd = SparkEvaluator.spark_context.emptyRDD();
    return new Some<RDD<MRData>>(rdd.rdd());
}
 
Example 11
Source File: MultiReturnParameterizedBuiltinSPInstruction.java    From systemds with Apache License 2.0
@Override 
@SuppressWarnings("unchecked")
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext) ec;
	
	try
	{
		//get input RDD and meta data
		FrameObject fo = sec.getFrameObject(input1.getName());
		FrameObject fometa = sec.getFrameObject(_outputs.get(1).getName());
		JavaPairRDD<Long,FrameBlock> in = (JavaPairRDD<Long,FrameBlock>)
			sec.getRDDHandleForFrameObject(fo, InputInfo.BinaryBlockInputInfo);
		String spec = ec.getScalarInput(input2).getStringValue();
		DataCharacteristics mcIn = sec.getDataCharacteristics(input1.getName());
		DataCharacteristics mcOut = sec.getDataCharacteristics(output.getName());
		String[] colnames = !TfMetaUtils.isIDSpec(spec) ?
			in.lookup(1L).get(0).getColumnNames() : null; 
		
		//step 1: build transform meta data
		Encoder encoderBuild = EncoderFactory.createEncoder(spec, colnames,
			fo.getSchema(), (int)fo.getNumColumns(), null);
		
		MaxLongAccumulator accMax = registerMaxLongAccumulator(sec.getSparkContext()); 
		JavaRDD<String> rcMaps = in
			.mapPartitionsToPair(new TransformEncodeBuildFunction(encoderBuild))
			.distinct().groupByKey()
			.flatMap(new TransformEncodeGroupFunction(accMax));
		if( containsMVImputeEncoder(encoderBuild) ) {
			EncoderMVImpute mva = getMVImputeEncoder(encoderBuild);
			rcMaps = rcMaps.union(
				in.mapPartitionsToPair(new TransformEncodeBuild2Function(mva))
				  .groupByKey().flatMap(new TransformEncodeGroup2Function(mva)) );
		}
		rcMaps.saveAsTextFile(fometa.getFileName()); //trigger eval
		
		//consolidate meta data frame (reuse multi-threaded reader, special handling missing values) 
		FrameReader reader = FrameReaderFactory.createFrameReader(InputInfo.TextCellInputInfo);
		FrameBlock meta = reader.readFrameFromHDFS(fometa.getFileName(), accMax.value(), fo.getNumColumns());
		meta.recomputeColumnCardinality(); //recompute num distinct items per column
		meta.setColumnNames((colnames!=null)?colnames:meta.getColumnNames());
		
		//step 2: transform apply (similar to spark transformapply)
		//compute omit offset map for block shifts
		TfOffsetMap omap = null;
		if( TfMetaUtils.containsOmitSpec(spec, colnames) ) {
			omap = new TfOffsetMap(SparkUtils.toIndexedLong(in.mapToPair(
				new RDDTransformApplyOffsetFunction(spec, colnames)).collect()));
		}
		
		//create encoder broadcast (avoiding replication per task) 
		Encoder encoder = EncoderFactory.createEncoder(spec, colnames,
			fo.getSchema(), (int)fo.getNumColumns(), meta);
		mcOut.setDimension(mcIn.getRows()-((omap!=null)?omap.getNumRmRows():0), encoder.getNumCols()); 
		Broadcast<Encoder> bmeta = sec.getSparkContext().broadcast(encoder);
		Broadcast<TfOffsetMap> bomap = (omap!=null) ? sec.getSparkContext().broadcast(omap) : null;
		
		//execute transform apply
		JavaPairRDD<Long,FrameBlock> tmp = in
			.mapToPair(new RDDTransformApplyFunction(bmeta, bomap));
		JavaPairRDD<MatrixIndexes,MatrixBlock> out = FrameRDDConverterUtils
			.binaryBlockToMatrixBlock(tmp, mcOut, mcOut);
		
		//set output and maintain lineage/output characteristics
		sec.setRDDHandleForVariable(_outputs.get(0).getName(), out);
		sec.addLineageRDD(_outputs.get(0).getName(), input1.getName());
		sec.setFrameOutput(_outputs.get(1).getName(), meta);
	}
	catch(IOException ex) {
		throw new RuntimeException(ex);
	}
}
 
Example 12
Source File: MultiReturnParameterizedBuiltinSPInstruction.java    From systemds with Apache License 2.0
@Override 
@SuppressWarnings("unchecked")
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext) ec;
	
	try
	{
		//get input RDD and meta data
		FrameObject fo = sec.getFrameObject(input1.getName());
		FrameObject fometa = sec.getFrameObject(_outputs.get(1).getName());
		JavaPairRDD<Long,FrameBlock> in = (JavaPairRDD<Long,FrameBlock>)
			sec.getRDDHandleForFrameObject(fo, FileFormat.BINARY);
		String spec = ec.getScalarInput(input2).getStringValue();
		DataCharacteristics mcIn = sec.getDataCharacteristics(input1.getName());
		DataCharacteristics mcOut = sec.getDataCharacteristics(output.getName());
		String[] colnames = !TfMetaUtils.isIDSpec(spec) ?
			in.lookup(1L).get(0).getColumnNames() : null; 
		
		//step 1: build transform meta data
		Encoder encoderBuild = EncoderFactory.createEncoder(spec, colnames,
			fo.getSchema(), (int)fo.getNumColumns(), null);
		
		MaxLongAccumulator accMax = registerMaxLongAccumulator(sec.getSparkContext()); 
		JavaRDD<String> rcMaps = in
			.mapPartitionsToPair(new TransformEncodeBuildFunction(encoderBuild))
			.distinct().groupByKey()
			.flatMap(new TransformEncodeGroupFunction(accMax));
		if( containsMVImputeEncoder(encoderBuild) ) {
			EncoderMVImpute mva = getMVImputeEncoder(encoderBuild);
			rcMaps = rcMaps.union(
				in.mapPartitionsToPair(new TransformEncodeBuild2Function(mva))
				  .groupByKey().flatMap(new TransformEncodeGroup2Function(mva)) );
		}
		rcMaps.saveAsTextFile(fometa.getFileName()); //trigger eval
		
		//consolidate meta data frame (reuse multi-threaded reader, special handling missing values) 
		FrameReader reader = FrameReaderFactory.createFrameReader(FileFormat.TEXT);
		FrameBlock meta = reader.readFrameFromHDFS(fometa.getFileName(), accMax.value(), fo.getNumColumns());
		meta.recomputeColumnCardinality(); //recompute num distinct items per column
		meta.setColumnNames((colnames!=null)?colnames:meta.getColumnNames());
		
		//step 2: transform apply (similar to spark transformapply)
		//compute omit offset map for block shifts
		TfOffsetMap omap = null;
		if( TfMetaUtils.containsOmitSpec(spec, colnames) ) {
			omap = new TfOffsetMap(SparkUtils.toIndexedLong(in.mapToPair(
				new RDDTransformApplyOffsetFunction(spec, colnames)).collect()));
		}
		
		//create encoder broadcast (avoiding replication per task) 
		Encoder encoder = EncoderFactory.createEncoder(spec, colnames,
			fo.getSchema(), (int)fo.getNumColumns(), meta);
		mcOut.setDimension(mcIn.getRows()-((omap!=null)?omap.getNumRmRows():0), encoder.getNumCols()); 
		Broadcast<Encoder> bmeta = sec.getSparkContext().broadcast(encoder);
		Broadcast<TfOffsetMap> bomap = (omap!=null) ? sec.getSparkContext().broadcast(omap) : null;
		
		//execute transform apply
		JavaPairRDD<Long,FrameBlock> tmp = in
			.mapToPair(new RDDTransformApplyFunction(bmeta, bomap));
		JavaPairRDD<MatrixIndexes,MatrixBlock> out = FrameRDDConverterUtils
			.binaryBlockToMatrixBlock(tmp, mcOut, mcOut);
		
		//set output and maintain lineage/output characteristics
		sec.setRDDHandleForVariable(_outputs.get(0).getName(), out);
		sec.addLineageRDD(_outputs.get(0).getName(), input1.getName());
		sec.setFrameOutput(_outputs.get(1).getName(), meta);
	}
	catch(IOException ex) {
		throw new RuntimeException(ex);
	}
}