Java Code Examples for org.apache.spark.api.java.JavaRDD#mapPartitionsWithIndex()

The following examples show how to use org.apache.spark.api.java.JavaRDD#mapPartitionsWithIndex(). They are taken from open source projects; each example lists the source file, the project it comes from, and that project's license.
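Before looking at the project examples, it helps to keep the method's shape in mind: JavaRDD#mapPartitionsWithIndex(Function2<Integer, Iterator<T>, Iterator<R>> f, boolean preservesPartitioning) calls f once per partition with the partition index and an iterator over that partition's elements, and expects an iterator of results back. The following minimal sketch (class and variable names are illustrative, not taken from any of the projects below) tags every element with the index of its partition:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class MapPartitionsWithIndexSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("mapPartitionsWithIndex-sketch");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<Integer> numbers = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6), 2);

            // The lambda implements Function2<Integer, Iterator<Integer>, Iterator<String>>:
            // partition index in, iterator over the partition's elements in, iterator of results out.
            JavaRDD<String> tagged = numbers.mapPartitionsWithIndex((partitionIndex, elements) -> {
                List<String> out = new ArrayList<>();
                while (elements.hasNext()) {
                    out.add("partition " + partitionIndex + " -> " + elements.next());
                }
                return out.iterator();
            }, false); // false: do not claim the partitioner is preserved

            tagged.collect().forEach(System.out::println);
        }
    }
}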
Example 1
Source File: StreamingProcessor.java    From lambda-arch with Apache License 2.0
private JavaRDD<IoTData> getEnhancedObjWithKafkaInfo(JavaRDD<ConsumerRecord<String, IoTData>> item) {
    OffsetRange[] offsetRanges = ((HasOffsetRanges) item.rdd()).offsetRanges();

    return item.mapPartitionsWithIndex((index, items) -> {
        Map<String, String> meta = new HashMap<String, String>() {{
            int partition = offsetRanges[index].partition();
            long from = offsetRanges[index].fromOffset();
            long until = offsetRanges[index].untilOffset();

            put("topic", offsetRanges[index].topic());
            put("fromOffset", "" + from);
            put("kafkaPartition", "" + partition);
            put("untilOffset", "" + until);
        }};
        List<IoTData> list = new ArrayList<>();
        while (items.hasNext()) {
            ConsumerRecord<String, IoTData> next = items.next();
            IoTData dataItem = next.value();
            meta.put("dayOfWeek", "" + dataItem.getTimestamp().toLocalDate().getDayOfWeek().getValue());
            dataItem.setMetaData(meta);
            list.add(dataItem);
        }
        return list.iterator();
    }, true);
}
 
Example 2
Source File: MarkDuplicatesSparkUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Method that ensures the reads are grouped together, keyed by their read name, with index pairs representing the source partition.
 * If the bam is query-grouped or queryname sorted, it calls spanReadsByKey to perform the mapping operation.
 * If the bam is sorted in some other way, it performs a groupBy operation on the key.
 */
private static JavaPairRDD<String, Iterable<IndexPair<GATKRead>>> getReadsGroupedByName(SAMFileHeader header, JavaRDD<GATKRead> reads, int numReducers) {

    final JavaPairRDD<String, Iterable<IndexPair<GATKRead>>> keyedReads;
    final JavaRDD<IndexPair<GATKRead>> indexedReads = reads.mapPartitionsWithIndex(
            (index, iter) -> Utils.stream(iter).map(read -> {
                if (!(read.getClass() == SAMRecordToGATKReadAdapter.class)) {
                    throw new GATKException(String.format("MarkDuplicatesSpark currently only supports SAMRecords as an underlying reads data source class, %s found instead",
                            read.getClass().toString()));
                }
                return new IndexPair<>(read, index);}).iterator(), false);
    if (ReadUtils.isReadNameGroupedBam(header)) {
        // reads are already grouped by name, so perform grouping within the partition (no shuffle)
        keyedReads = spanReadsByKey(indexedReads);
    } else {
        // sort by group and name (incurs a shuffle)
        throw new GATKException(String.format("MarkDuplicatesSparkUtils.mark() requires input reads to be queryname sorted or querygrouped, yet the header indicated it was in %s order instead", header.getSortOrder()));
    }
    return keyedReads;
}
 
Example 3
Source File: SparkUtils.java    From deeplearning4j with Apache License 2.0
/**
 * Equivalent to {@link #balancedRandomSplit(int, int, JavaRDD)} with control over the RNG seed
 */
public static <T> JavaRDD<T>[] balancedRandomSplit(int totalObjectCount, int numObjectsPerSplit, JavaRDD<T> data,
                long rngSeed) {
    JavaRDD<T>[] splits;
    if (totalObjectCount <= numObjectsPerSplit) {
        splits = (JavaRDD<T>[]) Array.newInstance(JavaRDD.class, 1);
        splits[0] = data;
    } else {
        int numSplits = totalObjectCount / numObjectsPerSplit; //Intentional round down
        splits = (JavaRDD<T>[]) Array.newInstance(JavaRDD.class, numSplits);
        for (int i = 0; i < numSplits; i++) {
            splits[i] = data.mapPartitionsWithIndex(new SplitPartitionsFunction<T>(i, numSplits, rngSeed), true);
        }

    }
    return splits;
}
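SplitPartitionsFunction above is deeplearning4j's own Function2 implementation and is not shown here. As an illustration of the pattern only (not the actual dl4j class), a per-partition splitter of the same shape could assign each element a split id using an RNG seeded with rngSeed plus the partition index, so the numSplits resulting RDDs are deterministic and disjoint:

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Random;

import org.apache.spark.api.java.function.Function2;

// Illustrative sketch: keeps only the elements of a partition whose randomly assigned
// split id equals splitIndex. Seeding with rngSeed + partitionIndex makes the assignment
// reproducible, so the numSplits output RDDs cover the input without overlap.
public class SplitPartitionsSketch<T> implements Function2<Integer, Iterator<T>, Iterator<T>> {
    private final int splitIndex;
    private final int numSplits;
    private final long rngSeed;

    public SplitPartitionsSketch(int splitIndex, int numSplits, long rngSeed) {
        this.splitIndex = splitIndex;
        this.numSplits = numSplits;
        this.rngSeed = rngSeed;
    }

    @Override
    public Iterator<T> call(Integer partitionIndex, Iterator<T> elements) {
        Random rng = new Random(rngSeed + partitionIndex);
        List<T> kept = new ArrayList<>();
        while (elements.hasNext()) {
            T element = elements.next();
            if (rng.nextInt(numSplits) == splitIndex) {
                kept.add(element);
            }
        }
        return kept.iterator();
    }
}

It would be used exactly as in the loop above, e.g. data.mapPartitionsWithIndex(new SplitPartitionsSketch<>(i, numSplits, rngSeed), true); preservesPartitioning is true because every element stays in its original partition.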
 
Example 4
Source File: TransformationRDD.java    From hui-bigdata-spark with Apache License 2.0
/**
 * Element transformation performed inside each partition.
 * Purpose of the demo: compute squares. (The first parameter is the partition index.)
 *
 * @since hui_project 1.0.0
 */
public void testMapPartitionsWithIndex() {
    SparkConf sparkConf = new SparkConf().setMaster("local[4]").setAppName("test");
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
    JavaRDD<Integer> parallelize = sparkContext.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), 3);
    JavaRDD<Tuple2<Integer, Integer>> rdd = parallelize.mapPartitionsWithIndex((x, y) -> getSquareWithIndex(x, y), false);
    checkResult(rdd.collect());
}
 
Example 5
Source File: TransformationRDDTest.java    From hui-bigdata-spark with Apache License 2.0
/**
 * Element transformation performed inside each partition.
 * Purpose of the demo: compute squares. (The first parameter is the partition index.)
 *
 * @since hui_project 1.0.0
 */
@Test
public void testMapPartitionsWithIndex(){
    JavaRDD<Integer> parallelize = sparkContext.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), 3);
    JavaRDD<Tuple2<Integer, Integer>> rdd = parallelize.mapPartitionsWithIndex((x, y) -> getSquareWithIndex(x, y), false);
    checkResult(rdd.collect());
}
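The getSquareWithIndex helper called in Examples 4 and 5 is not included in the snippets. A plausible reconstruction, consistent with the call site (partition index and an iterator of integers in, an iterator of Tuple2<Integer, Integer> out), is sketched below; the exact pairing of partition index and square is an assumption:

// Hypothetical helper; requires import scala.Tuple2 plus java.util.ArrayList/Iterator/List.
private static Iterator<Tuple2<Integer, Integer>> getSquareWithIndex(Integer partitionIndex, Iterator<Integer> numbers) {
    List<Tuple2<Integer, Integer>> result = new ArrayList<>();
    while (numbers.hasNext()) {
        int value = numbers.next();
        // Pair the index of the partition with the squared value.
        result.add(new Tuple2<>(partitionIndex, value * value));
    }
    return result.iterator();
}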
 
Example 6
Source File: MapPartitionsWithIndex.java    From SparkDemo with MIT License
private static void mapPartitionsWithIndex(JavaSparkContext sc) {

    List<String> names = Arrays.asList("张三1", "李四1", "王五1", "张三2", "李四2", "王五2", "张三3", "李四3", "王五3", "张三4");

    // Initialize the RDD with 3 partitions
    JavaRDD<String> namesRDD = sc.parallelize(names, 3);
    JavaRDD<String> mapPartitionsWithIndexRDD = namesRDD
            .mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() {

                private static final long serialVersionUID = 1L;

                public Iterator<String> call(Integer v1, Iterator<String> v2) throws Exception {
                    List<String> list = new ArrayList<String>();
                    while (v2.hasNext()) {
                        // Prefix each element with the index of the partition it lives in
                        list.add("Partition index: " + v1 + "\t" + v2.next());
                    }
                    return list.iterator();
                }
            }, true);

    // Collect the data from the cluster into local (driver) memory
    List<String> result = mapPartitionsWithIndexRDD.collect();
    for (String s : result) {
        System.out.println(s);
    }

    sc.close();
}
 
Example 7
Source File: SparkTrainWorker.java    From ytk-learn with MIT License
public boolean sparkTrain(JavaRDD<String> rdd) {
    JavaRDD<String> repartition = rdd.repartition(slaveNum);
    JavaRDD<Boolean> partRDD = repartition.mapPartitionsWithIndex(trainFunc, true);
    List<Boolean> res = partRDD.collect();
    for (boolean result : res) {
        if (!result) {
            return false;
        }
    }
    return true;
}
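The trainFunc used here is a field of SparkTrainWorker and is not shown. From the call site it has to be a Function2<Integer, Iterator<String>, Iterator<Boolean>> that consumes one partition of training lines and emits a single success flag. A purely illustrative sketch of that shape (not ytk-learn's implementation; trainOnLine is a hypothetical helper):

// Requires org.apache.spark.api.java.function.Function2, java.util.Collections, java.util.Iterator.
Function2<Integer, Iterator<String>, Iterator<Boolean>> trainFunc = (partitionIndex, lines) -> {
    boolean success = true;
    try {
        while (lines.hasNext()) {
            // The partition index can double as a worker id; each line is one training record.
            trainOnLine(partitionIndex, lines.next());
        }
    } catch (Exception e) {
        success = false;
    }
    return Collections.singletonList(success).iterator();
};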
 
Example 8
Source File: HBaseIndex.java    From hudi with Apache License 2.0
@Override
public JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD, JavaSparkContext jsc,
    HoodieTable<T> hoodieTable) {
  final HBaseIndexQPSResourceAllocator hBaseIndexQPSResourceAllocator = createQPSResourceAllocator(this.config);
  setPutBatchSize(writeStatusRDD, hBaseIndexQPSResourceAllocator, jsc);
  LOG.info("multiPutBatchSize before hbase puts: " + multiPutBatchSize);
  JavaRDD<WriteStatus> writeStatusJavaRDD = writeStatusRDD.mapPartitionsWithIndex(updateLocationFunction(), true);
  // caching the index updated status RDD
  writeStatusJavaRDD = writeStatusJavaRDD.persist(SparkConfigUtils.getWriteStatusStorageLevel(config.getProps()));
  return writeStatusJavaRDD;
}
 
Example 9
Source File: BaseTrainingMaster.java    From deeplearning4j with Apache License 2.0
protected String export(JavaRDD<DataSet> trainingData) {
    String baseDir = getBaseDirForRDD(trainingData);
    String dataDir = baseDir + "data/";
    String pathsDir = baseDir + "paths/";

    log.info("Initiating RDD<DataSet> export at {}", baseDir);
    JavaRDD<String> paths = trainingData
                    .mapPartitionsWithIndex(new BatchAndExportDataSetsFunction(batchSizePerWorker, dataDir), true);
    paths.saveAsTextFile(pathsDir);
    log.info("RDD<DataSet> export complete at {}", baseDir);

    lastExportedRDDId = trainingData.id();
    lastRDDExportPath = baseDir;
    return baseDir;
}
 
Example 10
Source File: BaseTrainingMaster.java    From deeplearning4j with Apache License 2.0
protected String exportMDS(JavaRDD<MultiDataSet> trainingData) {
    String baseDir = getBaseDirForRDD(trainingData);
    String dataDir = baseDir + "data/";
    String pathsDir = baseDir + "paths/";

    log.info("Initiating RDD<MultiDataSet> export at {}", baseDir);
    JavaRDD<String> paths = trainingData.mapPartitionsWithIndex(
                    new BatchAndExportMultiDataSetsFunction(batchSizePerWorker, dataDir), true);
    paths.saveAsTextFile(pathsDir);
    log.info("RDD<MultiDataSet> export complete at {}", baseDir);

    lastExportedRDDId = trainingData.id();
    lastRDDExportPath = baseDir;
    return baseDir;
}
 
Example 11
Source File: HBaseIndex.java    From hudi with Apache License 2.0
@Override
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, JavaSparkContext jsc,
    HoodieTable<T> hoodieTable) {
  return recordRDD.mapPartitionsWithIndex(locationTagFunction(hoodieTable.getMetaClient()), true);
}
 
Example 12
Source File: InMemoryHashIndex.java    From hudi with Apache License 2.0
@Override
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, JavaSparkContext jsc,
    HoodieTable<T> hoodieTable) {
  return recordRDD.mapPartitionsWithIndex(this.new LocationTagFunction(), true);
}
 
Example 13
Source File: BayesianNetworkSampler.java    From toolbox with Apache License 2.0
public DataSpark sampleToDataSpark(JavaSparkContext sc, int nSamples, int parallelism) {

    int localNSamples = nSamples / parallelism;

    JavaRDD<Integer> partitions = sc.parallelize(Arrays.asList(new Integer[parallelism]), parallelism);

    Function2<Integer, Iterator<Integer>, Iterator<DataInstance>> getPartitionSample =
            new Function2<Integer, Iterator<Integer>, Iterator<DataInstance>>() {
                @Override
                public Iterator<DataInstance> call(Integer ind, Iterator<Integer> iterator) throws Exception {
                    localSampler.setSeed(seed + ind);
                    return localSampler.sampleToDataStream(localNSamples).iterator();
                }
            };

    JavaRDD<DataInstance> sampleRDD = partitions.mapPartitionsWithIndex(getPartitionSample, false);

    // Get the attributes from a local instance
    Attributes attributes = this.localSampler.sampleToDataStream(1).getAttributes();

    return new DataSparkFromRDD(sampleRDD, attributes);
}
 
Example 14
Source File: TestExport.java    From deeplearning4j with Apache License 2.0
    @Test
    public void testBatchAndExportDataSetsFunction() throws Exception {
        String baseDir = System.getProperty("java.io.tmpdir");
        baseDir = FilenameUtils.concat(baseDir, "dl4j_spark_testBatchAndExport/");
        baseDir = baseDir.replaceAll("\\\\", "/");
        File f = new File(baseDir);
        if (f.exists())
            FileUtils.deleteDirectory(f);
        f.mkdir();
        f.deleteOnExit();
        int minibatchSize = 5;
        int nIn = 4;
        int nOut = 3;

        List<DataSet> dataSets = new ArrayList<>();
        dataSets.add(new DataSet(Nd4j.create(10, nIn), Nd4j.create(10, nOut))); //Larger than minibatch size -> tests splitting
        for (int i = 0; i < 98; i++) {
            if (i % 2 == 0) {
                dataSets.add(new DataSet(Nd4j.create(5, nIn), Nd4j.create(5, nOut)));
            } else {
                dataSets.add(new DataSet(Nd4j.create(1, nIn), Nd4j.create(1, nOut)));
                dataSets.add(new DataSet(Nd4j.create(1, nIn), Nd4j.create(1, nOut)));
                dataSets.add(new DataSet(Nd4j.create(3, nIn), Nd4j.create(3, nOut)));
            }
        }

        Collections.shuffle(dataSets, new Random(12345));

        JavaRDD<DataSet> rdd = sc.parallelize(dataSets);
        rdd = rdd.repartition(1); //For testing purposes (should get exactly 100 out, but maybe more with more partitions)


        JavaRDD<String> pathsRdd = rdd.mapPartitionsWithIndex(
                        new BatchAndExportDataSetsFunction(minibatchSize, "file:///" + baseDir), true);

        List<String> paths = pathsRdd.collect();
        assertEquals(100, paths.size());

        File[] files = f.listFiles();
        assertNotNull(files);

        int count = 0;
        for (File file : files) {
            if (!file.getPath().endsWith(".bin"))
                continue;
//            System.out.println(file);
            DataSet ds = new DataSet();
            ds.load(file);
            assertEquals(minibatchSize, ds.numExamples());

            count++;
        }

        assertEquals(100, count);

        FileUtils.deleteDirectory(f);
    }
 
Example 15
Source File: TestExport.java    From deeplearning4j with Apache License 2.0
    @Test
    public void testBatchAndExportMultiDataSetsFunction() throws Exception {
        String baseDir = System.getProperty("java.io.tmpdir");
        baseDir = FilenameUtils.concat(baseDir, "dl4j_spark_testBatchAndExportMDS/");
        baseDir = baseDir.replaceAll("\\\\", "/");
        File f = new File(baseDir);
        if (f.exists())
            FileUtils.deleteDirectory(f);
        f.mkdir();
        f.deleteOnExit();
        int minibatchSize = 5;
        int nIn = 4;
        int nOut = 3;

        List<MultiDataSet> dataSets = new ArrayList<>();
        dataSets.add(new org.nd4j.linalg.dataset.MultiDataSet(Nd4j.create(10, nIn), Nd4j.create(10, nOut))); //Larger than minibatch size -> tests splitting
        for (int i = 0; i < 98; i++) {
            if (i % 2 == 0) {
                dataSets.add(new org.nd4j.linalg.dataset.MultiDataSet(Nd4j.create(5, nIn), Nd4j.create(5, nOut)));
            } else {
                dataSets.add(new org.nd4j.linalg.dataset.MultiDataSet(Nd4j.create(1, nIn), Nd4j.create(1, nOut)));
                dataSets.add(new org.nd4j.linalg.dataset.MultiDataSet(Nd4j.create(1, nIn), Nd4j.create(1, nOut)));
                dataSets.add(new org.nd4j.linalg.dataset.MultiDataSet(Nd4j.create(3, nIn), Nd4j.create(3, nOut)));
            }
        }

        Collections.shuffle(dataSets, new Random(12345));

        JavaRDD<MultiDataSet> rdd = sc.parallelize(dataSets);
        rdd = rdd.repartition(1); //For testing purposes (should get exactly 100 out, but maybe more with more partitions)


        JavaRDD<String> pathsRdd = rdd.mapPartitionsWithIndex(
                        new BatchAndExportMultiDataSetsFunction(minibatchSize, "file:///" + baseDir), true);

        List<String> paths = pathsRdd.collect();
        assertEquals(100, paths.size());

        File[] files = f.listFiles();
        assertNotNull(files);

        int count = 0;
        for (File file : files) {
            if (!file.getPath().endsWith(".bin"))
                continue;
//            System.out.println(file);
            MultiDataSet ds = new org.nd4j.linalg.dataset.MultiDataSet();
            ds.load(file);
            assertEquals(minibatchSize, ds.getFeatures(0).size(0));
            assertEquals(minibatchSize, ds.getLabels(0).size(0));

            count++;
        }

        assertEquals(100, count);

        FileUtils.deleteDirectory(f);
    }