Java Code Examples for org.apache.spark.api.java.JavaRDD#mapPartitionsToPair()
The following examples show how to use org.apache.spark.api.java.JavaRDD#mapPartitionsToPair().
You can go to the original project or source file by following the links above each example.
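Before the project examples, a minimal self-contained sketch of the call itself may help (everything here is made up; it assumes Spark 2.x, where the partition function returns an Iterator): the function is invoked once per partition, receives that partition's records as an Iterator<T>, and returns an Iterator<Tuple2<K, V>>, producing a JavaPairRDD<K, V>.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class MapPartitionsToPairSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("sketch").setMaster("local[*]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<String> lines = sc.parallelize(Arrays.asList("a,1", "b,2", "c,3"));
            // Invoked once per partition: consume the partition's records and
            // return an iterator of key-value tuples.
            JavaPairRDD<String, Integer> pairs = lines.mapPartitionsToPair(it -> {
                List<Tuple2<String, Integer>> out = new ArrayList<>();
                while (it.hasNext()) {
                    String[] parts = it.next().split(",");
                    out.add(new Tuple2<>(parts[0], Integer.parseInt(parts[1])));
                }
                return out.iterator();
            });
            System.out.println(pairs.collectAsMap()); // {a=1, b=2, c=3}
        }
    }
}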
Example 1
Source File: SparkUtils.java From deeplearning4j with Apache License 2.0
/**
 * Equivalent to {@link #balancedRandomSplit(int, int, JavaRDD)} but for pair RDDs, and with control over the RNG seed
 */
public static <T, U> JavaPairRDD<T, U>[] balancedRandomSplit(int totalObjectCount, int numObjectsPerSplit,
                JavaPairRDD<T, U> data, long rngSeed) {
    JavaPairRDD<T, U>[] splits;
    if (totalObjectCount <= numObjectsPerSplit) {
        splits = (JavaPairRDD<T, U>[]) Array.newInstance(JavaPairRDD.class, 1);
        splits[0] = data;
    } else {
        int numSplits = totalObjectCount / numObjectsPerSplit; //Intentional round down
        splits = (JavaPairRDD<T, U>[]) Array.newInstance(JavaPairRDD.class, numSplits);
        for (int i = 0; i < numSplits; i++) {
            //What we really need is a .mapPartitionsToPairWithIndex function
            //but, of course Spark doesn't provide this
            //So we need to do a two-step process here...
            JavaRDD<Tuple2<T, U>> split = data.mapPartitionsWithIndex(
                            new SplitPartitionsFunction2<T, U>(i, numSplits, rngSeed), true);
            splits[i] = split.mapPartitionsToPair(new MapTupleToPairFlatMap<T, U>(), true);
        }
    }
    return splits;
}
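The comment above points at a real gap in the Java API: mapPartitionsWithIndex exists only on JavaRDD, so getting a partition-indexed pair RDD takes two steps. A self-contained sketch of that two-step idiom (class and variable names are made up; assumes Spark 2.x):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class TwoStepPairWithIndex {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("two-step").setMaster("local[2]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<String> words = sc.parallelize(Arrays.asList("a", "bb", "ccc", "dddd"), 2);
            // Step 1: mapPartitionsWithIndex exposes the partition index, but can
            // only produce a JavaRDD<Tuple2<...>>, not a JavaPairRDD.
            JavaRDD<Tuple2<Integer, String>> tagged = words.mapPartitionsWithIndex(
                    (partitionIndex, it) -> {
                        List<Tuple2<Integer, String>> out = new ArrayList<>();
                        while (it.hasNext()) {
                            out.add(new Tuple2<>(partitionIndex, it.next()));
                        }
                        return out.iterator();
                    }, true);
            // Step 2: mapPartitionsToPair converts the tuples into a real JavaPairRDD.
            JavaPairRDD<Integer, String> pairs = tagged.mapPartitionsToPair(it -> it, true);
            System.out.println(pairs.groupByKey().collectAsMap());
        }
    }
}

The identity lambda in step 2 plays the same role as MapTupleToPairFlatMap above: it changes the static type from JavaRDD<Tuple2<...>> to JavaPairRDD<...> without touching the data.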
Example 2
Source File: UserVisitSessionAnalyzeSpark.java From BigDataPlatform with GNU General Public License v3.0
/**
 * Get the RDD mapping each sessionid to its visit-action data
 * @param actionRDD
 * @return
 */
public static JavaPairRDD<String, Row> getSessionid2ActionRDD(JavaRDD<Row> actionRDD) {
    // Per-record alternative using mapToPair:
    // return actionRDD.mapToPair(new PairFunction<Row, String, Row>() {
    //
    //     private static final long serialVersionUID = 1L;
    //
    //     @Override
    //     public Tuple2<String, Row> call(Row row) throws Exception {
    //         return new Tuple2<String, Row>(row.getString(2), row);
    //     }
    //
    // });

    return actionRDD.mapPartitionsToPair(new PairFlatMapFunction<Iterator<Row>, String, Row>() {

        private static final long serialVersionUID = 1L;

        @Override
        public Iterator<Tuple2<String, Row>> call(Iterator<Row> iterator) throws Exception {
            List<Tuple2<String, Row>> list = new ArrayList<Tuple2<String, Row>>();

            while (iterator.hasNext()) {
                Row row = iterator.next();
                list.add(new Tuple2<String, Row>(row.getString(2), row));
            }

            return list.iterator();
        }
    });
}
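Note that this version buffers the whole partition in an ArrayList before returning it. If partitions are large, a lazier variant can wrap the input iterator instead, keeping memory use constant per record. A hypothetical drop-in replacement for the method above (same imports as the example assumed):

// Lazier sketch: wrap the partition iterator instead of buffering the partition.
public static JavaPairRDD<String, Row> getSessionid2ActionRDD(JavaRDD<Row> actionRDD) {
    return actionRDD.mapPartitionsToPair(iterator -> new Iterator<Tuple2<String, Row>>() {
        @Override
        public boolean hasNext() {
            return iterator.hasNext();
        }

        @Override
        public Tuple2<String, Row> next() {
            Row row = iterator.next();
            return new Tuple2<>(row.getString(2), row);
        }
    });
}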
Example 3
Source File: HDFSWriter.java From ViraPipe with MIT License
private static JavaPairRDD<Text, SequencedFragment> alignmentsToFastq(JavaRDD<String> alignmentRDD, SAMFileHeader header) {
    return alignmentRDD.mapPartitionsToPair(alns -> {
        List<Tuple2<Text, SequencedFragment>> records = new ArrayList<Tuple2<Text, SequencedFragment>>();
        // The parser is created once per partition and reused for every alignment line
        final SAMLineParser samLP = new SAMLineParser(new DefaultSAMRecordFactory(),
                ValidationStringency.SILENT, header, null, null);
        while (alns.hasNext()) {
            String aln = alns.next().replace("\r\n", "").replace("\n", "").replace(System.lineSeparator(), "");
            try {
                SAMRecord sam = samLP.parseLine(aln);
                String[] fields = aln.split("\\t");
                String name = fields[0];
                if (sam.getReadPairedFlag()) {
                    if (sam.getFirstOfPairFlag()) name = name + "/1";
                    if (sam.getSecondOfPairFlag()) name = name + "/2";
                }
                String bases = fields[9];
                String quality = fields[10];

                Text t = new Text(name);
                SequencedFragment sf = new SequencedFragment();
                sf.setSequence(new Text(bases));
                sf.setQuality(new Text(quality));
                records.add(new Tuple2<Text, SequencedFragment>(t, sf));
            } catch (SAMFormatException e) {
                System.out.println(e.getMessage());
            }
        }
        return records.iterator();
    });
}
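The detail worth copying here is that the SAMLineParser, which is relatively expensive to construct and not serializable, is built once per partition and reused for every record. Here is the same per-partition-setup pattern in a dependency-free sketch (SimpleDateFormat stands in for the expensive parser; all names are made up):

import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class PerPartitionSetup {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("per-partition-setup").setMaster("local[2]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<String> lines = sc.parallelize(Arrays.asList("2021-01-01", "2021-06-15"));
            JavaPairRDD<Long, String> parsed = lines.mapPartitionsToPair(it -> {
                // Created once per partition, reused for every record -- the same
                // reason SAMLineParser is built inside the partition function above.
                SimpleDateFormat fmt = new SimpleDateFormat("yyyy-MM-dd");
                List<Tuple2<Long, String>> out = new ArrayList<>();
                while (it.hasNext()) {
                    String line = it.next();
                    out.add(new Tuple2<>(fmt.parse(line).getTime(), line));
                }
                return out.iterator();
            });
            System.out.println(parsed.collect());
        }
    }
}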
Example 4
Source File: JoinReadsWithVariants.java From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Joins each read of an RDD<GATKRead> with overlapping variants from a list of variants files.
 *
 * @param reads the RDD of reads, in coordinate-sorted order
 * @param variantsFileNames the names of the variants files added via {@code SparkContext#addFile()}
 * @return an RDD that contains each read along with the overlapping variants
 */
public static JavaPairRDD<GATKRead, Iterable<GATKVariant>> join(final JavaRDD<GATKRead> reads, final List<String> variantsFileNames) {
    return reads.mapPartitionsToPair((PairFlatMapFunction<Iterator<GATKRead>, GATKRead, Iterable<GATKVariant>>) gatkReadIterator -> {
        List<FeatureDataSource<VariantContext>> variantSources = variantsFileNames.stream()
                .map(fileName -> openFeatureSource(SparkFiles.get(fileName)))
                .collect(Collectors.toList());
        Iterator<Tuple2<GATKRead, Iterable<GATKVariant>>> iterator =
                Iterators.transform(gatkReadIterator, read -> getVariantsOverlappingRead(read, variantSources));
        return new CloseAtEndIterator<>(iterator, new AutoCloseableCollection(variantSources)); // close FeatureDataSource at end of iteration
    });
}
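Because Spark consumes the returned iterator lazily, the FeatureDataSources cannot be closed inside the partition function itself; a wrapper closes them when iteration ends. CloseAtEndIterator is a GATK-internal utility; a minimal wrapper with (what appears to be) the same contract might look like this sketch, which is an assumption rather than GATK's actual implementation:

import java.util.Iterator;

// Sketch of a close-at-end wrapper: delegates to an iterator and closes the
// attached resource exactly once when the iterator is exhausted.
class ClosingIterator<T> implements Iterator<T> {
    private final Iterator<T> delegate;
    private final AutoCloseable resource;
    private boolean closed = false;

    ClosingIterator(Iterator<T> delegate, AutoCloseable resource) {
        this.delegate = delegate;
        this.resource = resource;
    }

    @Override
    public boolean hasNext() {
        if (delegate.hasNext()) {
            return true;
        }
        if (!closed) {
            closed = true;
            try {
                resource.close(); // free the per-partition resource at end of iteration
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }
        return false;
    }

    @Override
    public T next() {
        return delegate.next();
    }
}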
Example 5
Source File: ProcessedOffsetManager.java From kafka-spark-consumer with Apache License 2.0
@SuppressWarnings("deprecation") public static <T> void persistsPartition(JavaRDD<MessageAndMetadata<T>> rdd, Properties props) throws Exception { JavaPairRDD<String,Long> partitionOffsetRdd = rdd.mapPartitionsToPair(new PartitionOffsetPair<>()); JavaPairRDD<String, Iterable<Long>> partitonOffset = partitionOffsetRdd.groupByKey(1); List<Tuple2<String, Iterable<Long>>> poList = partitonOffset.collect(); doPersists(poList, props); }
Example 6
Source File: PSScorer.java From gatk with BSD 3-Clause "New" or "Revised" License
public JavaRDD<GATKRead> scoreReads(final JavaSparkContext ctx, final JavaRDD<GATKRead> pairedReads,
                                    final JavaRDD<GATKRead> unpairedReads, final SAMFileHeader header) {

    //Group reads into pairs
    final JavaRDD<Iterable<GATKRead>> groupedReads = groupReadsIntoPairs(pairedReads, unpairedReads, scoreArgs.readsPerPartitionEstimate);

    //Load taxonomy database, created by running PathSeqBuildReferenceTaxonomy with this reference
    final PSTaxonomyDatabase taxDB = readTaxonomyDatabase(scoreArgs.taxonomyDatabasePath);
    final Broadcast<PSTaxonomyDatabase> taxonomyDatabaseBroadcast = ctx.broadcast(taxDB);

    //Check header against database
    if (scoreArgs.headerWarningFile != null) {
        writeMissingReferenceAccessions(scoreArgs.headerWarningFile, header, taxDB, logger);
    }

    //Determine which alignments are valid hits and return their tax IDs in PSPathogenAlignmentHit
    //Also adds pathseq tags containing the hit IDs to the reads
    final JavaRDD<Tuple2<Iterable<GATKRead>, PSPathogenAlignmentHit>> readHits =
            mapGroupedReadsToTax(groupedReads, scoreArgs.minIdentity, scoreArgs.identityMargin, taxonomyDatabaseBroadcast);

    //Get the original reads, now with their pathseq hit tags set
    final JavaRDD<GATKRead> readsFinal = flattenIterableKeys(readHits);

    //Compute taxonomic scores from the alignment hits
    final JavaRDD<PSPathogenAlignmentHit> alignmentHits = readHits.map(Tuple2::_2);
    final boolean divideByGenomeLength = scoreArgs.divideByGenomeLength; //To prevent serialization of PSScorer
    final JavaPairRDD<Integer, PSPathogenTaxonScore> taxScoresRdd = alignmentHits
            .mapPartitionsToPair(iter -> computeTaxScores(iter, taxonomyDatabaseBroadcast.value(), divideByGenomeLength));

    //Reduce scores by taxon and compute normalized scores
    Map<Integer, PSPathogenTaxonScore> taxScoresMap = new HashMap<>(taxScoresRdd.reduceByKey(PSPathogenTaxonScore::add).collectAsMap());
    taxScoresMap = computeNormalizedScores(taxScoresMap, taxDB.tree, scoreArgs.notNormalizedByKingdom);

    //Write scores to file
    writeScoresFile(taxScoresMap, taxDB.tree, scoreArgs.scoresPath);

    return readsFinal;
}
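One line deserves attention: copying scoreArgs.divideByGenomeLength into a local variable keeps the lambda passed to mapPartitionsToPair from capturing this, which would drag the whole (possibly non-serializable) PSScorer into the closure. A minimal illustration of that local-copy trick (class and field names are made up):

import org.apache.spark.api.java.JavaRDD;

// Hypothetical class illustrating the local-copy trick used above.
public class Scaler {
    private final int factor = 10; // instance state; using it directly in the lambda would capture 'this'

    public JavaRDD<Integer> scale(JavaRDD<Integer> in) {
        final int localFactor = factor; // the closure now captures only an int,
                                        // not the (possibly non-serializable) Scaler
        return in.map(x -> x * localFactor);
    }
}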