Java Code Examples for org.apache.spark.api.java.JavaRDD#mapPartitionsToPair()

The following examples show how to use org.apache.spark.api.java.JavaRDD#mapPartitionsToPair(). Each example comes from an open-source project; the source file and license are noted above the code.
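Before the project examples, here is a minimal, self-contained sketch of the call shape (all names are illustrative and not taken from the projects below). mapPartitionsToPair takes a PairFlatMapFunction<Iterator<T>, K, V> that is invoked once per partition: it consumes the partition's iterator and returns an iterator of Tuple2<K, V>. Compared with mapToPair, this amortizes per-element setup cost (parsers, connections, buffers) across a whole partition, which is the recurring motivation in the examples that follow.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class MapPartitionsToPairDemo {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("demo").setMaster("local[2]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<String> words = sc.parallelize(Arrays.asList("a", "bb", "ccc"));

            // Called once per partition: drain the iterator and emit
            // one (word, length) pair per element.
            JavaPairRDD<String, Integer> pairs = words.mapPartitionsToPair(iter -> {
                List<Tuple2<String, Integer>> out = new ArrayList<>();
                while (iter.hasNext()) {
                    String w = iter.next();
                    out.add(new Tuple2<>(w, w.length()));
                }
                return out.iterator();
            });

            System.out.println(pairs.collect()); // [(a,1), (bb,2), (ccc,3)]
        }
    }
}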
Example 1
Source File: SparkUtils.java    From deeplearning4j with Apache License 2.0
/**
 * Equivalent to {@link #balancedRandomSplit(int, int, JavaRDD)} but for pair RDDs, and with control over the RNG seed
 */
public static <T, U> JavaPairRDD<T, U>[] balancedRandomSplit(int totalObjectCount, int numObjectsPerSplit,
                JavaPairRDD<T, U> data, long rngSeed) {
    JavaPairRDD<T, U>[] splits;
    if (totalObjectCount <= numObjectsPerSplit) {
        splits = (JavaPairRDD<T, U>[]) Array.newInstance(JavaPairRDD.class, 1);
        splits[0] = data;
    } else {
        int numSplits = totalObjectCount / numObjectsPerSplit; //Intentional round down

        splits = (JavaPairRDD<T, U>[]) Array.newInstance(JavaPairRDD.class, numSplits);
        for (int i = 0; i < numSplits; i++) {

            //What we really need is a .mapPartitionsToPairWithIndex function
            //but, of course Spark doesn't provide this
            //So we need to do a two-step process here...

            JavaRDD<Tuple2<T, U>> split = data.mapPartitionsWithIndex(
                            new SplitPartitionsFunction2<T, U>(i, numSplits, rngSeed), true);
            splits[i] = split.mapPartitionsToPair(new MapTupleToPairFlatMap<T, U>(), true);
        }
    }
    return splits;
}
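A hedged usage sketch (sizes, keys, and seed are illustrative; assumes an existing JavaSparkContext sc and the SparkUtils class above):

List<Tuple2<String, Integer>> rows = new ArrayList<>();
for (int i = 0; i < 10_000; i++) {
    rows.add(new Tuple2<>("key" + i, i));
}
JavaPairRDD<String, Integer> data = sc.parallelizePairs(rows);

// 10,000 objects at 1,000 per split -> splits.length == 10
JavaPairRDD<String, Integer>[] splits =
        SparkUtils.balancedRandomSplit(10_000, 1_000, data, 42L);

Note that the Array.newInstance calls exist because Java cannot instantiate a generic array with new JavaPairRDD<T, U>[numSplits]; the unchecked cast is the standard workaround.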
 
Example 2
Source File: UserVisitSessionAnalyzeSpark.java    From BigDataPlatform with GNU General Public License v3.0
/**
	 * Returns an RDD that maps each session id to its visit-action data.
	 * @param actionRDD the RDD of user visit-action rows
	 * @return an RDD of (sessionId, Row) pairs, keyed by column 2 of each row
	 */
	public static JavaPairRDD<String, Row> getSessionid2ActionRDD(JavaRDD<Row> actionRDD) {
//		return actionRDD.mapToPair(new PairFunction<Row, String, Row>() {
//
//			private static final long serialVersionUID = 1L;
//
//			@Override
//			public Tuple2<String, Row> call(Row row) throws Exception {
//				return new Tuple2<String, Row>(row.getString(2), row);
//			}
//
//		});

		return actionRDD.mapPartitionsToPair(new PairFlatMapFunction<Iterator<Row>, String, Row>() {

			private static final long serialVersionUID = 1L;

			@Override
			public Iterator<Tuple2<String, Row>> call(Iterator<Row> iterator)
					throws Exception {
				List<Tuple2<String, Row>> list = new ArrayList<Tuple2<String, Row>>();

				while(iterator.hasNext()) {
					Row row = iterator.next();
					list.add(new Tuple2<String, Row>(row.getString(2), row));
				}

				return list.iterator();
			}

		});
	}
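The commented-out mapToPair version above invokes its function once per Row, while the mapPartitionsToPair version runs once per partition and buffers the partition's output in an ArrayList. That trades fewer function invocations for higher peak memory per partition; for very large partitions, a lazily transforming iterator (as in Example 4 below) avoids materializing the whole partition at once.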
 
Example 3
Source File: HDFSWriter.java    From ViraPipe with MIT License
private static JavaPairRDD<Text, SequencedFragment> alignmentsToFastq(JavaRDD<String> alignmentRDD, SAMFileHeader header) {
    return alignmentRDD.mapPartitionsToPair(alns -> {

        List<Tuple2<Text, SequencedFragment>> records = new ArrayList<Tuple2<Text, SequencedFragment>>();
        final SAMLineParser samLP = new SAMLineParser(new DefaultSAMRecordFactory(), ValidationStringency.SILENT, header, null, null);
        while (alns.hasNext()) {
            // Strip line terminators before parsing the SAM line
            String aln = alns.next().replace("\r\n", "").replace("\n", "");
            try{
                SAMRecord sam = samLP.parseLine(aln);
                String[] fields = aln.split("\\t");
                String name = fields[0];
                if(sam.getReadPairedFlag()){
                    if(sam.getFirstOfPairFlag())
                        name = name+"/1";
                    if(sam.getSecondOfPairFlag())
                        name = name+"/2";
                }

                String bases = fields[9];    // SAM SEQ column
                String quality = fields[10]; // SAM QUAL column

                Text t = new Text(name);
                SequencedFragment sf = new SequencedFragment();
                sf.setSequence(new Text(bases));
                sf.setQuality(new Text(quality));
                records.add(new Tuple2<Text, SequencedFragment>(t, sf));
            }catch(SAMFormatException e){
                // Skip lines that fail SAM parsing, but report why
                System.err.println(e.getMessage());
            }
        }
        return records.iterator();
    });
}
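The returned pairs use exactly the key/value types expected by Hadoop-BAM's FastqOutputFormat, so they can be written back out as FASTQ. A hedged sketch of that step (the output path is illustrative; assumes alignments, header, and a JavaSparkContext sc are in scope):

JavaPairRDD<Text, SequencedFragment> fastq = alignmentsToFastq(alignments, header);
fastq.saveAsNewAPIHadoopFile(
        "hdfs:///out/reads.fastq",   // illustrative path
        Text.class,
        SequencedFragment.class,
        org.seqdoop.hadoop_bam.FastqOutputFormat.class,
        sc.hadoopConfiguration());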
 
Example 4
Source File: JoinReadsWithVariants.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Joins each read of an RDD<GATKRead> with overlapping variants from a list of variants files.
 *
 * @param reads the RDD of reads, in coordinate-sorted order
 * @param variantsFileNames the names of the variants files added via {@code SparkContext#addFile()}
 * @return an RDD that contains each read along with the overlapping variants
 */
public static JavaPairRDD<GATKRead, Iterable<GATKVariant>> join(final JavaRDD<GATKRead> reads, final List<String> variantsFileNames) {
    return reads.mapPartitionsToPair((PairFlatMapFunction<Iterator<GATKRead>, GATKRead, Iterable<GATKVariant>>) gatkReadIterator -> {
        List<FeatureDataSource<VariantContext>> variantSources = variantsFileNames.stream().map(fileName -> openFeatureSource(SparkFiles.get(fileName))).collect(Collectors.toList());
        Iterator<Tuple2<GATKRead, Iterable<GATKVariant>>> iterator = Iterators.transform(gatkReadIterator, read -> getVariantsOverlappingRead(read, variantSources));
        return new CloseAtEndIterator<>(iterator, new AutoCloseableCollection(variantSources)); // close FeatureDataSource at end of iteration
    });
}
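CloseAtEndIterator and AutoCloseableCollection are GATK utilities, but the pattern is worth seeing on its own. This is an illustrative re-implementation (not GATK's code) that closes a resource the first time the wrapped iterator reports exhaustion:

import java.util.Iterator;

final class CloseOnExhaustionIterator<T> implements Iterator<T> {
    private final Iterator<T> delegate;
    private final AutoCloseable resource;
    private boolean closed = false;

    CloseOnExhaustionIterator(Iterator<T> delegate, AutoCloseable resource) {
        this.delegate = delegate;
        this.resource = resource;
    }

    @Override
    public boolean hasNext() {
        if (delegate.hasNext()) {
            return true;
        }
        if (!closed) {
            closed = true;
            try {
                resource.close(); // partition fully consumed: release resources
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }
        return false;
    }

    @Override
    public T next() {
        return delegate.next();
    }
}

The wrapper matters here because Spark consumes the iterator returned from mapPartitionsToPair lazily; a try-with-resources block around the function body would close the FeatureDataSources before the first tuple was ever read.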
 
Example 5
Source File: ProcessedOffsetManager.java    From kafka-spark-consumer with Apache License 2.0
@SuppressWarnings("deprecation")
public static <T> void persistsPartition(JavaRDD<MessageAndMetadata<T>> rdd, Properties props) throws Exception {
      JavaPairRDD<String,Long> partitionOffsetRdd = rdd.mapPartitionsToPair(new PartitionOffsetPair<>());
      JavaPairRDD<String, Iterable<Long>> partitionOffset = partitionOffsetRdd.groupByKey(1);
      List<Tuple2<String, Iterable<Long>>> poList = partitionOffset.collect();
      doPersists(poList, props);
}
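PartitionOffsetPair is defined elsewhere in the project; its general shape, emitting one (partition key, highest offset) pair per Spark partition, can be sketched with a self-contained stand-in. The Msg class below is illustrative and is not the library's MessageAndMetadata:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.spark.api.java.function.PairFlatMapFunction;

import scala.Tuple2;

class Msg {
    final String topicPartition; // e.g. "topic-0"
    final long offset;
    Msg(String topicPartition, long offset) {
        this.topicPartition = topicPartition;
        this.offset = offset;
    }
}

// Emits one (topic-partition, max offset seen) pair per Spark partition.
class MaxOffsetPerPartition implements PairFlatMapFunction<Iterator<Msg>, String, Long> {
    @Override
    public Iterator<Tuple2<String, Long>> call(Iterator<Msg> msgs) {
        Map<String, Long> max = new HashMap<>();
        while (msgs.hasNext()) {
            Msg m = msgs.next();
            max.merge(m.topicPartition, m.offset, Math::max);
        }
        List<Tuple2<String, Long>> out = new ArrayList<>();
        for (Map.Entry<String, Long> e : max.entrySet()) {
            out.add(new Tuple2<>(e.getKey(), e.getValue()));
        }
        return out.iterator();
    }
}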
 
Example 6
Source File: PSScorer.java    From gatk with BSD 3-Clause "New" or "Revised" License
public JavaRDD<GATKRead> scoreReads(final JavaSparkContext ctx,
                                    final JavaRDD<GATKRead> pairedReads,
                                    final JavaRDD<GATKRead> unpairedReads,
                                    final SAMFileHeader header) {

    //Group reads into pairs
    final JavaRDD<Iterable<GATKRead>> groupedReads = groupReadsIntoPairs(pairedReads,
            unpairedReads, scoreArgs.readsPerPartitionEstimate);

    //Load taxonomy database, created by running PathSeqBuildReferenceTaxonomy with this reference
    final PSTaxonomyDatabase taxDB = readTaxonomyDatabase(scoreArgs.taxonomyDatabasePath);
    final Broadcast<PSTaxonomyDatabase> taxonomyDatabaseBroadcast = ctx.broadcast(taxDB);

    //Check header against database
    if (scoreArgs.headerWarningFile != null) {
        writeMissingReferenceAccessions(scoreArgs.headerWarningFile, header, taxDB, logger);
    }

    //Determine which alignments are valid hits and return their tax IDs in PSPathogenAlignmentHit
    //Also adds pathseq tags containing the hit IDs to the reads
    final JavaRDD<Tuple2<Iterable<GATKRead>, PSPathogenAlignmentHit>> readHits = mapGroupedReadsToTax(groupedReads,
            scoreArgs.minIdentity, scoreArgs.identityMargin, taxonomyDatabaseBroadcast);

    //Get the original reads, now with their pathseq hit tags set
    final JavaRDD<GATKRead> readsFinal = flattenIterableKeys(readHits);

    //Compute taxonomic scores from the alignment hits
    final JavaRDD<PSPathogenAlignmentHit> alignmentHits = readHits.map(Tuple2::_2);
    final boolean divideByGenomeLength = scoreArgs.divideByGenomeLength; //To prevent serialization of PSScorer
    final JavaPairRDD<Integer, PSPathogenTaxonScore> taxScoresRdd = alignmentHits
            .mapPartitionsToPair(iter -> computeTaxScores(iter, taxonomyDatabaseBroadcast.value(), divideByGenomeLength));

    //Reduce scores by taxon and compute normalized scores
    Map<Integer, PSPathogenTaxonScore> taxScoresMap = new HashMap<>(taxScoresRdd.reduceByKey(PSPathogenTaxonScore::add).collectAsMap());
    taxScoresMap = computeNormalizedScores(taxScoresMap, taxDB.tree, scoreArgs.notNormalizedByKingdom);

    //Write scores to file
    writeScoresFile(taxScoresMap, taxDB.tree, scoreArgs.scoresPath);

    return readsFinal;
}
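The mapPartitionsToPair call that produces taxScoresRdd follows the same per-partition aggregation idea as Example 5: computeTaxScores sees an entire partition of alignment hits at once, so scores can be accumulated locally before reduceByKey shuffles them, moving far fewer records across the cluster than a per-read mapToPair followed by a global reduce would.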