Java Code Examples for org.apache.spark.api.java.JavaRDD#mapPartitionsToPair()
The following examples show how to use org.apache.spark.api.java.JavaRDD#mapPartitionsToPair().
You can go to the original project or source file by following the links above each example.
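Before the project examples, a minimal self-contained sketch of the call itself may help (everything here is made up; it assumes Spark 2.x, where the partition function returns an Iterator): the function is invoked once per partition, receives that partition's records as an Iterator<T>, and returns an Iterator<Tuple2<K, V>>, producing a JavaPairRDD<K, V>.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class MapPartitionsToPairSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("sketch").setMaster("local[*]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<String> lines = sc.parallelize(Arrays.asList("a,1", "b,2", "c,3"));
            // Invoked once per partition: consume the partition's records and
            // return an iterator of key-value tuples.
            JavaPairRDD<String, Integer> pairs = lines.mapPartitionsToPair(it -> {
                List<Tuple2<String, Integer>> out = new ArrayList<>();
                while (it.hasNext()) {
                    String[] parts = it.next().split(",");
                    out.add(new Tuple2<>(parts[0], Integer.parseInt(parts[1])));
                }
                return out.iterator();
            });
            System.out.println(pairs.collectAsMap()); // {a=1, b=2, c=3}
        }
    }
}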
Example 1
Source File: SparkUtils.java From deeplearning4j with Apache License 2.0
/**
 * Equivalent to {@link #balancedRandomSplit(int, int, JavaRDD)} but for pair RDDs, and with control over the RNG seed
 */
public static <T, U> JavaPairRDD<T, U>[] balancedRandomSplit(int totalObjectCount, int numObjectsPerSplit,
                JavaPairRDD<T, U> data, long rngSeed) {
    JavaPairRDD<T, U>[] splits;
    if (totalObjectCount <= numObjectsPerSplit) {
        splits = (JavaPairRDD<T, U>[]) Array.newInstance(JavaPairRDD.class, 1);
        splits[0] = data;
    } else {
        int numSplits = totalObjectCount / numObjectsPerSplit; //Intentional round down
        splits = (JavaPairRDD<T, U>[]) Array.newInstance(JavaPairRDD.class, numSplits);
        for (int i = 0; i < numSplits; i++) {
            //What we really need is a .mapPartitionsToPairWithIndex function
            //but, of course Spark doesn't provide this
            //So we need to do a two-step process here...
            JavaRDD<Tuple2<T, U>> split = data.mapPartitionsWithIndex(
                            new SplitPartitionsFunction2<T, U>(i, numSplits, rngSeed), true);
            splits[i] = split.mapPartitionsToPair(new MapTupleToPairFlatMap<T, U>(), true);
        }
    }
    return splits;
}
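The comment above points at a real gap in the Java API: mapPartitionsWithIndex exists only on JavaRDD, so getting a partition-indexed pair RDD takes two steps. A self-contained sketch of that two-step idiom (class and variable names are made up; assumes Spark 2.x):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class TwoStepPairWithIndex {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("two-step").setMaster("local[2]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<String> words = sc.parallelize(Arrays.asList("a", "bb", "ccc", "dddd"), 2);
            // Step 1: mapPartitionsWithIndex exposes the partition index, but can
            // only produce a JavaRDD<Tuple2<...>>, not a JavaPairRDD.
            JavaRDD<Tuple2<Integer, String>> tagged = words.mapPartitionsWithIndex(
                    (partitionIndex, it) -> {
                        List<Tuple2<Integer, String>> out = new ArrayList<>();
                        while (it.hasNext()) {
                            out.add(new Tuple2<>(partitionIndex, it.next()));
                        }
                        return out.iterator();
                    }, true);
            // Step 2: mapPartitionsToPair converts the tuples into a real JavaPairRDD.
            JavaPairRDD<Integer, String> pairs = tagged.mapPartitionsToPair(it -> it, true);
            System.out.println(pairs.groupByKey().collectAsMap());
        }
    }
}

The identity lambda in step 2 plays the same role as MapTupleToPairFlatMap above: it changes the static type from JavaRDD<Tuple2<...>> to JavaPairRDD<...> without touching the data.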
Example 2
Source File: UserVisitSessionAnalyzeSpark.java From BigDataPlatform with GNU General Public License v3.0
/**
 * Get the RDD mapping each sessionid to its visit-action data
 * @param actionRDD
 * @return
 */
public static JavaPairRDD<String, Row> getSessionid2ActionRDD(JavaRDD<Row> actionRDD) {
    // Per-record alternative using mapToPair:
    // return actionRDD.mapToPair(new PairFunction<Row, String, Row>() {
    //
    //     private static final long serialVersionUID = 1L;
    //
    //     @Override
    //     public Tuple2<String, Row> call(Row row) throws Exception {
    //         return new Tuple2<String, Row>(row.getString(2), row);
    //     }
    //
    // });

    return actionRDD.mapPartitionsToPair(new PairFlatMapFunction<Iterator<Row>, String, Row>() {

        private static final long serialVersionUID = 1L;

        @Override
        public Iterator<Tuple2<String, Row>> call(Iterator<Row> iterator) throws Exception {
            List<Tuple2<String, Row>> list = new ArrayList<Tuple2<String, Row>>();

            while (iterator.hasNext()) {
                Row row = iterator.next();
                list.add(new Tuple2<String, Row>(row.getString(2), row));
            }

            return list.iterator();
        }
    });
}
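Note that this version buffers the whole partition in an ArrayList before returning it. If partitions are large, a lazier variant can wrap the input iterator instead, keeping memory use constant per record. A hypothetical drop-in replacement for the method above (same imports as the example assumed):

// Lazier sketch: wrap the partition iterator instead of buffering the partition.
public static JavaPairRDD<String, Row> getSessionid2ActionRDD(JavaRDD<Row> actionRDD) {
    return actionRDD.mapPartitionsToPair(iterator -> new Iterator<Tuple2<String, Row>>() {
        @Override
        public boolean hasNext() {
            return iterator.hasNext();
        }

        @Override
        public Tuple2<String, Row> next() {
            Row row = iterator.next();
            return new Tuple2<>(row.getString(2), row);
        }
    });
}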
Example 3
Source File: HDFSWriter.java From ViraPipe with MIT License
private static JavaPairRDD<Text, SequencedFragment> alignmentsToFastq(JavaRDD<String> alignmentRDD, SAMFileHeader header) {
    return alignmentRDD.mapPartitionsToPair(alns -> {
        List<Tuple2<Text, SequencedFragment>> records = new ArrayList<Tuple2<Text, SequencedFragment>>();
        // The parser is created once per partition and reused for every alignment line
        final SAMLineParser samLP = new SAMLineParser(new DefaultSAMRecordFactory(),
                ValidationStringency.SILENT, header, null, null);
        while (alns.hasNext()) {
            String aln = alns.next().replace("\r\n", "").replace("\n", "").replace(System.lineSeparator(), "");
            try {
                SAMRecord sam = samLP.parseLine(aln);
                String[] fields = aln.split("\\t");
                String name = fields[0];
                if (sam.getReadPairedFlag()) {
                    if (sam.getFirstOfPairFlag()) name = name + "/1";
                    if (sam.getSecondOfPairFlag()) name = name + "/2";
                }
                String bases = fields[9];
                String quality = fields[10];

                Text t = new Text(name);
                SequencedFragment sf = new SequencedFragment();
                sf.setSequence(new Text(bases));
                sf.setQuality(new Text(quality));
                records.add(new Tuple2<Text, SequencedFragment>(t, sf));
            } catch (SAMFormatException e) {
                System.out.println(e.getMessage());
            }
        }
        return records.iterator();
    });
}
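The detail worth copying here is that the SAMLineParser, which is relatively expensive to construct and not serializable, is built once per partition and reused for every record. Here is the same per-partition-setup pattern in a dependency-free sketch (SimpleDateFormat stands in for the expensive parser; all names are made up):

import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class PerPartitionSetup {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("per-partition-setup").setMaster("local[2]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<String> lines = sc.parallelize(Arrays.asList("2021-01-01", "2021-06-15"));
            JavaPairRDD<Long, String> parsed = lines.mapPartitionsToPair(it -> {
                // Created once per partition, reused for every record -- the same
                // reason SAMLineParser is built inside the partition function above.
                SimpleDateFormat fmt = new SimpleDateFormat("yyyy-MM-dd");
                List<Tuple2<Long, String>> out = new ArrayList<>();
                while (it.hasNext()) {
                    String line = it.next();
                    out.add(new Tuple2<>(fmt.parse(line).getTime(), line));
                }
                return out.iterator();
            });
            System.out.println(parsed.collect());
        }
    }
}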
Example 4
Source File: JoinReadsWithVariants.java From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Joins each read of an RDD<GATKRead> with overlapping variants from a list of variants files.
 *
 * @param reads the RDD of reads, in coordinate-sorted order
 * @param variantsFileNames the names of the variants files added via {@code SparkContext#addFile()}
 * @return an RDD that contains each read along with the overlapping variants
 */
public static JavaPairRDD<GATKRead, Iterable<GATKVariant>> join(final JavaRDD<GATKRead> reads, final List<String> variantsFileNames) {
    return reads.mapPartitionsToPair((PairFlatMapFunction<Iterator<GATKRead>, GATKRead, Iterable<GATKVariant>>) gatkReadIterator -> {
        List<FeatureDataSource<VariantContext>> variantSources = variantsFileNames.stream()
                .map(fileName -> openFeatureSource(SparkFiles.get(fileName)))
                .collect(Collectors.toList());
        Iterator<Tuple2<GATKRead, Iterable<GATKVariant>>> iterator =
                Iterators.transform(gatkReadIterator, read -> getVariantsOverlappingRead(read, variantSources));
        return new CloseAtEndIterator<>(iterator, new AutoCloseableCollection(variantSources)); // close FeatureDataSource at end of iteration
    });
}
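Because Spark consumes the returned iterator lazily, the FeatureDataSources cannot be closed inside the partition function itself; a wrapper closes them when iteration ends. CloseAtEndIterator is a GATK-internal utility; a minimal wrapper with (what appears to be) the same contract might look like this sketch, which is an assumption rather than GATK's actual implementation:

import java.util.Iterator;

// Sketch of a close-at-end wrapper: delegates to an iterator and closes the
// attached resource exactly once when the iterator is exhausted.
class ClosingIterator<T> implements Iterator<T> {
    private final Iterator<T> delegate;
    private final AutoCloseable resource;
    private boolean closed = false;

    ClosingIterator(Iterator<T> delegate, AutoCloseable resource) {
        this.delegate = delegate;
        this.resource = resource;
    }

    @Override
    public boolean hasNext() {
        if (delegate.hasNext()) {
            return true;
        }
        if (!closed) {
            closed = true;
            try {
                resource.close(); // free the per-partition resource at end of iteration
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }
        return false;
    }

    @Override
    public T next() {
        return delegate.next();
    }
}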
Example 5
Source File: ProcessedOffsetManager.java From kafka-spark-consumer with Apache License 2.0
@SuppressWarnings("deprecation") public static <T> void persistsPartition(JavaRDD<MessageAndMetadata<T>> rdd, Properties props) throws Exception { JavaPairRDD<String,Long> partitionOffsetRdd = rdd.mapPartitionsToPair(new PartitionOffsetPair<>()); JavaPairRDD<String, Iterable<Long>> partitonOffset = partitionOffsetRdd.groupByKey(1); List<Tuple2<String, Iterable<Long>>> poList = partitonOffset.collect(); doPersists(poList, props); }
Example 6
Source File: PSScorer.java From gatk with BSD 3-Clause "New" or "Revised" License
public JavaRDD<GATKRead> scoreReads(final JavaSparkContext ctx, final JavaRDD<GATKRead> pairedReads,
                                    final JavaRDD<GATKRead> unpairedReads, final SAMFileHeader header) {

    //Group reads into pairs
    final JavaRDD<Iterable<GATKRead>> groupedReads = groupReadsIntoPairs(pairedReads, unpairedReads, scoreArgs.readsPerPartitionEstimate);

    //Load taxonomy database, created by running PathSeqBuildReferenceTaxonomy with this reference
    final PSTaxonomyDatabase taxDB = readTaxonomyDatabase(scoreArgs.taxonomyDatabasePath);
    final Broadcast<PSTaxonomyDatabase> taxonomyDatabaseBroadcast = ctx.broadcast(taxDB);

    //Check header against database
    if (scoreArgs.headerWarningFile != null) {
        writeMissingReferenceAccessions(scoreArgs.headerWarningFile, header, taxDB, logger);
    }

    //Determine which alignments are valid hits and return their tax IDs in PSPathogenAlignmentHit
    //Also adds pathseq tags containing the hit IDs to the reads
    final JavaRDD<Tuple2<Iterable<GATKRead>, PSPathogenAlignmentHit>> readHits =
            mapGroupedReadsToTax(groupedReads, scoreArgs.minIdentity, scoreArgs.identityMargin, taxonomyDatabaseBroadcast);

    //Get the original reads, now with their pathseq hit tags set
    final JavaRDD<GATKRead> readsFinal = flattenIterableKeys(readHits);

    //Compute taxonomic scores from the alignment hits
    final JavaRDD<PSPathogenAlignmentHit> alignmentHits = readHits.map(Tuple2::_2);
    final boolean divideByGenomeLength = scoreArgs.divideByGenomeLength; //To prevent serialization of PSScorer
    final JavaPairRDD<Integer, PSPathogenTaxonScore> taxScoresRdd = alignmentHits
            .mapPartitionsToPair(iter -> computeTaxScores(iter, taxonomyDatabaseBroadcast.value(), divideByGenomeLength));

    //Reduce scores by taxon and compute normalized scores
    Map<Integer, PSPathogenTaxonScore> taxScoresMap = new HashMap<>(taxScoresRdd.reduceByKey(PSPathogenTaxonScore::add).collectAsMap());
    taxScoresMap = computeNormalizedScores(taxScoresMap, taxDB.tree, scoreArgs.notNormalizedByKingdom);

    //Write scores to file
    writeScoresFile(taxScoresMap, taxDB.tree, scoreArgs.scoresPath);

    return readsFinal;
}
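One line deserves attention: copying scoreArgs.divideByGenomeLength into a local variable keeps the lambda passed to mapPartitionsToPair from capturing this, which would drag the whole (possibly non-serializable) PSScorer into the closure. A minimal illustration of that local-copy trick (class and field names are made up):

import org.apache.spark.api.java.JavaRDD;

// Hypothetical class illustrating the local-copy trick used above.
public class Scaler {
    private final int factor = 10; // instance state; using it directly in the lambda would capture 'this'

    public JavaRDD<Integer> scale(JavaRDD<Integer> in) {
        final int localFactor = factor; // the closure now captures only an int,
                                        // not the (possibly non-serializable) Scaler
        return in.map(x -> x * localFactor);
    }
}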