Java Code Examples for org.apache.spark.api.java.JavaRDD#collect()

The following examples show how to use org.apache.spark.api.java.JavaRDD#collect(). Each example is drawn from an open-source project; the source file and license are noted above each snippet.
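Before the project-specific examples, here is a minimal, self-contained sketch of what collect() does: it is an action that runs the RDD's lineage as a Spark job and returns every element to the driver as a java.util.List. The class name and the local master setting below are illustrative only and not taken from any of the projects that follow.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class CollectExample {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("CollectExample").setMaster("local[*]");
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            JavaRDD<Integer> numbers = jsc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
            JavaRDD<Integer> squares = numbers.map(x -> x * x);

            // collect() is an action: it triggers the job and returns
            // every element of the RDD to the driver as a List.
            List<Integer> result = squares.collect();
            System.out.println(result); // [1, 4, 9, 16, 25]
        }
    }
}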
Example 1
Source File: DependencyParser.java    From vn.vitk with GNU General Public License v3.0
/**
 * Parses all sentences in an input file, each on a line and writes the result to 
 * the console window containing flattened dependency tuples.
 * @param jsc
 * @param inputFileName
 */
public void parse(JavaSparkContext jsc, String inputFileName) {
	List<String> sentences = jsc.textFile(inputFileName).collect();
	JavaRDD<String> input = jsc.parallelize(sentences);
	JavaRDD<Sentence> sents = input.map(new TaggedLineToSentenceFunction());
	JavaRDD<DependencyGraph> graphs = sents.map(new ParsingFunction());
	JavaRDD<String> rows = graphs.map(new Function<DependencyGraph, String>() {
		private static final long serialVersionUID = -6021310762521034121L;

		public String call(DependencyGraph graph) {
			return graph.dependencies();
		}
	});
	for (String s : rows.collect()) {
		System.out.println(s);
	}
}
 
Example 2
Source File: SparkExport.java    From deeplearning4j with Apache License 2.0
public static void exportCSVLocal(String outputDir, String baseFileName, int numFiles, String delimiter,
                String quote, JavaRDD<List<Writable>> data, int rngSeed) throws Exception {

    JavaRDD<String> lines = data.map(new WritablesToStringFunction(delimiter, quote));
    double[] split = new double[numFiles];
    for (int i = 0; i < split.length; i++)
        split[i] = 1.0 / numFiles;
    JavaRDD<String>[] splitData = lines.randomSplit(split);

    int count = 0;
    Random r = new Random(rngSeed);
    for (JavaRDD<String> subset : splitData) {
        String path = FilenameUtils.concat(outputDir, baseFileName + (count++) + ".csv");
        List<String> linesList = subset.collect();
        if (!(linesList instanceof ArrayList))
            linesList = new ArrayList<>(linesList);
        Collections.shuffle(linesList, r);
        FileUtils.writeLines(new File(path), linesList);
    }
}
 
Example 3
Source File: SparkExport.java    From deeplearning4j with Apache License 2.0
public static void exportCSVLocal(String outputDir, String baseFileName, int numFiles, String delimiter,
                String quote, JavaRDD<List<Writable>> data) throws Exception {

    JavaRDD<String> lines = data.map(new WritablesToStringFunction(delimiter, quote));
    double[] split = new double[numFiles];
    for (int i = 0; i < split.length; i++)
        split[i] = 1.0 / numFiles;
    JavaRDD<String>[] splitData = lines.randomSplit(split);

    int count = 0;
    for (JavaRDD<String> subset : splitData) {
        String path = FilenameUtils.concat(outputDir, baseFileName + (count++) + ".csv");
        //            subset.saveAsTextFile(path);
        List<String> linesList = subset.collect();
        FileUtils.writeLines(new File(path), linesList);
    }
}
 
Example 4
Source File: TextPipelineTest.java    From deeplearning4j with Apache License 2.0
@Test @Ignore   //AB 2020/04/20 https://github.com/eclipse/deeplearning4j/issues/8849
public void testCountCumSum() throws Exception {
    JavaSparkContext sc = getContext();
    JavaRDD<String> corpusRDD = getCorpusRDD(sc);
    Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vec.getTokenizerVarMap());

    TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
    pipeline.buildVocabCache();
    pipeline.buildVocabWordListRDD();
    JavaRDD<AtomicLong> sentenceCountRDD = pipeline.getSentenceCountRDD();

    CountCumSum countCumSum = new CountCumSum(sentenceCountRDD);
    JavaRDD<Long> sentenceCountCumSumRDD = countCumSum.buildCumSum();
    List<Long> sentenceCountCumSumList = sentenceCountCumSumRDD.collect();
    assertTrue(sentenceCountCumSumList.get(0) == 6L);
    assertTrue(sentenceCountCumSumList.get(1) == 9L);

    sc.stop();
}
 
Example 5
Source File: SparkKafkaDataSetWriter.java    From spliceengine with GNU Affero General Public License v3.0
@Override
public DataSet<ExecRow> write() throws StandardException{
    long start = System.currentTimeMillis();

    CountFunction countFunction = new CountFunction<>();
    KafkaStreamer kafkaStreamer = new KafkaStreamer(rdd.getNumPartitions(), topicName);
    JavaRDD streamed = rdd.map(countFunction).mapPartitionsWithIndex(kafkaStreamer, true);
    streamed.collect();

    Long count = countFunction.getCount().value();
    if(count == 0) {
        try {
            kafkaStreamer.noData();
        } catch(Exception e) {
            throw StandardException.newException("", e);
        }
    }

    long end = System.currentTimeMillis();
    ValueRow valueRow=new ValueRow(2);
    valueRow.setColumn(1,new SQLLongint(count));
    valueRow.setColumn(2,new SQLLongint(end-start));
    return new SparkDataSet<>(SpliceSpark.getContext().parallelize(Collections.singletonList(valueRow), 1));
}
 
Example 6
Source File: HaplotypeCallerSpark.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 * WriteVariants, this is currently going to be horribly slow and explosive on a full size file since it performs a collect.
 *
 * This will be replaced by a parallel writer similar to what's done with {@link org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSink}
 */
private void writeVariants(JavaRDD<VariantContext> variants) {
    final List<VariantContext> collectedVariants = variants.collect();
    final SAMSequenceDictionary referenceDictionary = getReferenceSequenceDictionary();

    final List<VariantContext> sortedVariants = collectedVariants.stream()
            .sorted((o1, o2) -> IntervalUtils.compareLocatables(o1, o2, referenceDictionary))
            .collect(Collectors.toList());

    final HaplotypeCallerEngine hcEngine = new HaplotypeCallerEngine(hcArgs, getHeaderForReads(), new ReferenceMultiSourceAdapter(getReference(), getAuthHolder()));
    try(final VariantContextWriter writer = hcEngine.makeVCFWriter(output, getBestAvailableSequenceDictionary())) {
        hcEngine.writeHeader(writer, getHeaderForReads().getSequenceDictionary(), Collections.emptySet());
        sortedVariants.forEach(writer::add);
    }
}
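The Javadoc warning above reflects a general property of collect(): every element is shipped into driver memory, so the cost grows with the full size of the RDD. When only a sample or a streamed pass over the results is needed, JavaRDD#take(int) and JavaRDD#toLocalIterator() are lighter-weight options. The sketch below is illustrative only (plain strings instead of GATK's VariantContext, and a hypothetical class name); it is not part of the HaplotypeCallerSpark code.

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class CollectAlternatives {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("CollectAlternatives").setMaster("local[*]");
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            JavaRDD<String> records = jsc.parallelize(
                    Arrays.asList("a", "b", "c", "d", "e"), 2);

            // take(n): only the first n elements are returned to the driver.
            List<String> sample = records.take(2);
            System.out.println("sample = " + sample);

            // toLocalIterator(): streams the RDD to the driver one
            // partition at a time instead of materializing everything at once.
            Iterator<String> it = records.toLocalIterator();
            while (it.hasNext()) {
                System.out.println(it.next());
            }
        }
    }
}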
 
Example 7
Source File: MapPartitionsWithIndex.java    From SparkDemo with MIT License
private static void mapPartitionsWithIndex(JavaSparkContext sc) {

		List<String> names = Arrays.asList("张三1", "李四1", "王五1", "张三2", "李四2", "王五2", "张三3", "李四3", "王五3", "张三4");

		// Initialize the RDD, split into 3 partitions
		JavaRDD<String> namesRDD = sc.parallelize(names, 3);
		JavaRDD<String> mapPartitionsWithIndexRDD = namesRDD
				.mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() {

					private static final long serialVersionUID = 1L;

					public Iterator<String> call(Integer v1, Iterator<String> v2) throws Exception {
						List<String> list = new ArrayList<String>();
						while (v2.hasNext()) {
							list.add("Partition index: " + v1 + "\t" + v2.next());
						}
						return list.iterator();
					}
				}, true);

		// Collect the data from the cluster into driver-local memory
		List<String> result = mapPartitionsWithIndexRDD.collect();
		for (String s : result) {
			System.out.println(s);
		}

		sc.close();
	}
 
Example 8
Source File: PersistenceRDDTest.java    From hui-bigdata-spark with Apache License 2.0
/**
 * Converts an external collection into an RDD.
 */
@Test
public void testParallelize() {
    List<String> stringList = Arrays.asList("1", "2", "3", "4", "5");
    JavaRDD<String> parallelize = sparkContext.parallelize(stringList);
    List<String> collect = parallelize.collect();
    checkResult(collect);
}
 
Example 9
Source File: ReadsSparkSourceUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License
private void doLoadReads(String bam, String referencePathName, ValidationStringency validationStringency) {
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();

    final GATKPath inputBamSpecifier = new GATKPath(bam);
    final GATKPath referenceInputPath = referencePathName == null ? null : new GATKPath(referencePathName);
    ReadsSparkSource readSource = new ReadsSparkSource(ctx, validationStringency);
    JavaRDD<GATKRead> rddSerialReads = getSerialReads(ctx, bam, referenceInputPath, validationStringency);
    JavaRDD<GATKRead> rddParallelReads = readSource.getParallelReads(inputBamSpecifier, referenceInputPath);

    List<GATKRead> serialReads = rddSerialReads.collect();
    List<GATKRead> parallelReads = rddParallelReads.collect();
    Assert.assertEquals(serialReads.size(), parallelReads.size());
}
 
Example 10
Source File: TestHoodieClientOnCopyOnWriteStorage.java    From hudi with Apache License 2.0
private Pair<Path, JavaRDD<WriteStatus>> testConsistencyCheck(HoodieTableMetaClient metaClient, String instantTime)
    throws Exception {
  HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false)
      .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true)
          .withMaxConsistencyCheckIntervalMs(1).withInitialConsistencyCheckIntervalMs(1).build())
      .build();
  HoodieWriteClient client = getHoodieWriteClient(cfg);

  client.startCommitWithTime(instantTime);
  JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(dataGen.generateInserts(instantTime, 200), 1);
  JavaRDD<WriteStatus> result = client.bulkInsert(writeRecords, instantTime);
  result.collect();

  // Create a dummy marker file to simulate the case that a marker file was created without data file.
  // This should fail the commit
  String partitionPath = Arrays
      .stream(fs.globStatus(new Path(String.format("%s/*/*/*/*", metaClient.getMarkerFolderPath(instantTime))),
          path -> path.toString().endsWith(HoodieTableMetaClient.MARKER_EXTN)))
      .limit(1).map(status -> status.getPath().getParent().toString()).collect(Collectors.toList()).get(0);
  Path markerFilePath = new Path(String.format("%s/%s", partitionPath,
      FSUtils.makeMarkerFile(instantTime, "1-0-1", UUID.randomUUID().toString())));
  metaClient.getFs().create(markerFilePath);
  LOG.info("Created a dummy marker path=" + markerFilePath);

  Exception e = assertThrows(HoodieCommitException.class, () -> {
    client.commit(instantTime, result);
  }, "Commit should fail due to consistency check");
  assertTrue(e.getCause() instanceof HoodieIOException);
  return Pair.of(markerFilePath, result);
}
 
Example 11
Source File: ReadsSparkSourceUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Test(dataProvider = "loadShardedReads", groups = "spark")
public void shardedReadsSparkSourceTest(String expectedBam, String shardedBam, String referencePath) {
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final GATKPath referenceInputPath = referencePath == null ? null : new GATKPath(referencePath);
    final GATKPath shardedBamSpecifier = new GATKPath(shardedBam);

    ReadsSparkSource readSource = new ReadsSparkSource(ctx);
    JavaRDD<GATKRead> rddSerialReads = getSerialReads(ctx, expectedBam, referenceInputPath, ReadConstants.DEFAULT_READ_VALIDATION_STRINGENCY);
    JavaRDD<GATKRead> rddParallelReads = readSource.getParallelReads(shardedBamSpecifier, referenceInputPath);

    List<GATKRead> serialReads = rddSerialReads.collect();
    List<GATKRead> parallelReads = rddParallelReads.collect();
    Assert.assertEquals(parallelReads.size(), serialReads.size());
}
 
Example 12
Source File: MiniBatchTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testMiniBatches() throws Exception {
    log.info("Setting up Spark Context...");
    JavaRDD<String> lines = sc.textFile(new ClassPathResource("svmLight/iris_svmLight_0.txt")
                    .getTempFileFromArchive().toURI().toString()).cache();
    long count = lines.count();
    assertEquals(300, count);
    // gotta map this to a Matrix/INDArray
    RecordReader rr = new SVMLightRecordReader();
    Configuration c = new Configuration();
    c.set(SVMLightRecordReader.NUM_FEATURES, "5");
    rr.setConf(c);
    JavaRDD<DataSet> points = lines.map(new RecordReaderFunction(rr, 4, 3)).cache();
    count = points.count();
    assertEquals(300, count);

    List<DataSet> collect = points.collect();

    points = points.repartition(1);
    JavaRDD<DataSet> miniBatches = new RDDMiniBatches(10, points).miniBatchesJava();
    count = miniBatches.count();
    List<DataSet> list = miniBatches.collect();
    assertEquals(30, count);    //Expect exactly 30 from 1 partition... could be more for multiple input partitions

    lines.unpersist();
    points.unpersist();
    miniBatches.map(new DataSetAssertionFunction());
}
 
Example 13
Source File: LogisticRegressionBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testLogisticRegression() {
    //prepare data
    String datapath = "src/test/resources/binary_classification_test.libsvm";
    JavaRDD<LabeledPoint> trainingData = MLUtils.loadLibSVMFile(sc.sc(), datapath).toJavaRDD();

    //Train model in spark
    LogisticRegressionModel lrmodel = new LogisticRegressionWithSGD().run(trainingData.rdd());

    //Export this model
    byte[] exportedModel = ModelExporter.export(lrmodel, null);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //validate predictions
    List<LabeledPoint> testPoints = trainingData.collect();
    for (LabeledPoint i : testPoints) {
        Vector v = i.features();
        double actual = lrmodel.predict(v);

        Map<String, Object> data = new HashMap<String, Object>();
        data.put("features", v.toArray());
        transformer.transform(data);
        double predicted = (double) data.get("prediction");

        assertEquals(actual, predicted, EPSILON);
    }
}
 
Example 14
Source File: TestExport.java    From deeplearning4j with Apache License 2.0
@Test
    public void testBatchAndExportDataSetsFunction() throws Exception {
        String baseDir = System.getProperty("java.io.tmpdir");
        baseDir = FilenameUtils.concat(baseDir, "dl4j_spark_testBatchAndExport/");
        baseDir = baseDir.replaceAll("\\\\", "/");
        File f = new File(baseDir);
        if (f.exists())
            FileUtils.deleteDirectory(f);
        f.mkdir();
        f.deleteOnExit();
        int minibatchSize = 5;
        int nIn = 4;
        int nOut = 3;

        List<DataSet> dataSets = new ArrayList<>();
        dataSets.add(new DataSet(Nd4j.create(10, nIn), Nd4j.create(10, nOut))); //Larger than minibatch size -> tests splitting
        for (int i = 0; i < 98; i++) {
            if (i % 2 == 0) {
                dataSets.add(new DataSet(Nd4j.create(5, nIn), Nd4j.create(5, nOut)));
            } else {
                dataSets.add(new DataSet(Nd4j.create(1, nIn), Nd4j.create(1, nOut)));
                dataSets.add(new DataSet(Nd4j.create(1, nIn), Nd4j.create(1, nOut)));
                dataSets.add(new DataSet(Nd4j.create(3, nIn), Nd4j.create(3, nOut)));
            }
        }

        Collections.shuffle(dataSets, new Random(12345));

        JavaRDD<DataSet> rdd = sc.parallelize(dataSets);
        rdd = rdd.repartition(1); //For testing purposes (should get exactly 100 out, but maybe more with more partitions)


        JavaRDD<String> pathsRdd = rdd.mapPartitionsWithIndex(
                        new BatchAndExportDataSetsFunction(minibatchSize, "file:///" + baseDir), true);

        List<String> paths = pathsRdd.collect();
        assertEquals(100, paths.size());

        File[] files = f.listFiles();
        assertNotNull(files);

        int count = 0;
        for (File file : files) {
            if (!file.getPath().endsWith(".bin"))
                continue;
//            System.out.println(file);
            DataSet ds = new DataSet();
            ds.load(file);
            assertEquals(minibatchSize, ds.numExamples());

            count++;
        }

        assertEquals(100, count);

        FileUtils.deleteDirectory(f);
    }
 
Example 15
Source File: Tokenizer.java    From vn.vitk with GNU General Public License v3.0
/**
 * Tokenizes a text file and returns a list of tokens.
 * @param fileName
 * @return a list of tokens.
 */
public List<String> tokenize(String fileName) {
	JavaRDD<String> input = readTextFile(fileName);
	JavaRDD<String> output = tokenize(input);
	return output.collect();
}
 
Example 16
Source File: RoughAlignmentClient.java    From render with GNU General Public License v2.0
private void alignTier()
        throws IOException {

    LOG.info("alignTier: entry");

    final List<HierarchicalStack> stacksToAlign = new ArrayList<>();

    if (! parameters.keepExisting(PipelineStep.ALIGN) || tierZeroStack.requiresAlignment()) {
        stacksToAlign.add(tierZeroStack);
        tierZeroStack.setAlignmentQuality(null);
    }

    if (stacksToAlign.size() == 1) {

        // broadcast EM_aligner tool to ensure that solver is run serially on each node
        final EMAlignerTool solver = new EMAlignerTool(new File(parameters.solverScript),
                                                       new File(parameters.solverParametersTemplate));
        final Broadcast<EMAlignerTool> broadcastEMAlignerTool = sparkContext.broadcast(solver);

        final HierarchicalTierSolveFunction solveStacksFunction =
                new HierarchicalTierSolveFunction(parameters.renderWeb.baseDataUrl,
                                                  parameters.zNeighborDistance,
                                                  broadcastEMAlignerTool);

        // remove any pre-existing alignment results ...
        driverTierRender.deleteStack(tierZeroStack.getAlignedStackId().getStack(), null);

        final JavaRDD<HierarchicalStack> rddTierStacksToAlign = sparkContext.parallelize(stacksToAlign);

        final JavaRDD<HierarchicalStack> rddTierStacksAfterAlignment =
                rddTierStacksToAlign.map(solveStacksFunction);

        final List<HierarchicalStack> tierStacksAfterAlignment = rddTierStacksAfterAlignment.collect();

        LOG.info("alignTier: processing results");

        final Double alignmentQuality = tierStacksAfterAlignment.get(0).getAlignmentQuality();

        if ((alignmentQuality == null) || (alignmentQuality < 0.0)) {
            throw new IOException("alignment of " + tierZeroStack.getSplitStackId() +
                                  " failed (alignment quality is " + alignmentQuality + ")");
        }

        tierZeroStack.setAlignmentQuality(alignmentQuality);
        persistHierarchicalData(tierZeroStack);

        LOG.info("alignTier: {} has alignment quality {}",
                 tierZeroStack.getAlignedStackId(), tierZeroStack.getAlignmentQuality());

    } else {
        LOG.info("alignTier: all aligned stacks have already been generated");
    }

    LOG.info("alignTier: exit");
}
 
Example 17
Source File: TransformSectionClient.java    From render with GNU General Public License v2.0
public void run()
            throws IOException, URISyntaxException {

        final SparkConf conf = new SparkConf().setAppName("TransformSectionClient");
        final JavaSparkContext sparkContext = new JavaSparkContext(conf);

        final String sparkAppId = sparkContext.getConf().getAppId();
        final String executorsJson = LogUtilities.getExecutorsApiJson(sparkAppId);

        LOG.info("run: appId is {}, executors data is {}", sparkAppId, executorsJson);


        final RenderDataClient sourceDataClient = parameters.renderWeb.getDataClient();

        final List<Double> zValues = sourceDataClient.getStackZValues(parameters.stack,
                                                                      parameters.layerRange.minZ,
                                                                      parameters.layerRange.maxZ);

        if (zValues.size() == 0) {
            throw new IllegalArgumentException("source stack does not contain any matching z values");
        }

        final RenderDataClient targetDataClient = new RenderDataClient(parameters.renderWeb.baseDataUrl,
                                                                       parameters.getTargetOwner(),
                                                                       parameters.getTargetProject());

        final StackMetaData sourceStackMetaData = sourceDataClient.getStackMetaData(parameters.stack);
        targetDataClient.setupDerivedStack(sourceStackMetaData, parameters.getTargetStack());

        final LeafTransformSpec stackTransform = new LeafTransformSpec(parameters.transformId,
                                                                       null,
                                                                       parameters.transformClass,
                                                                       parameters.transformData.replace(',', ' '));
//      make RDD
        final JavaRDD<Double> rddZValues = sparkContext.parallelize(zValues);

        final Function<Double, Integer> transformFunction = (Function<Double, Integer>) z -> {

            LogUtilities.setupExecutorLog4j("z " + z);
            //get the source client
            final RenderDataClient sourceDataClient1 = parameters.renderWeb.getDataClient();

            //get the target client(which can be the same as the source)
            final RenderDataClient targetDataClient1 = new RenderDataClient(parameters.renderWeb.baseDataUrl,
                                                                            parameters.getTargetOwner(),
                                                                            parameters.getTargetProject());

            final ResolvedTileSpecCollection sourceCollection =
                    sourceDataClient1.getResolvedTiles(parameters.stack, z);

            sourceCollection.addTransformSpecToCollection(stackTransform);
            sourceCollection.addReferenceTransformToAllTiles(stackTransform.getId(), false);

            //vs tile spec validation?
            sourceCollection.removeUnreferencedTransforms();

            targetDataClient1.saveResolvedTiles(sourceCollection, parameters.getTargetStack(), z);

            return sourceCollection.getTileCount();
        };

        // assign a transformation to the RDD
        final JavaRDD<Integer> rddTileCounts = rddZValues.map(transformFunction);

        // use an action to get the results
        final List<Integer> tileCountList = rddTileCounts.collect();
        long total = 0;

        for (final Integer tileCount : tileCountList) {
            total += tileCount;
        }

        LOG.info("run: collected stats");
        LOG.info("run: saved {} tiles and transforms", total);

        sparkContext.stop();
    }
 
Example 18
Source File: TestIteratorUtils.java    From deeplearning4j with Apache License 2.0
@Test
public void testRRMDSIJoin() throws Exception {

    ClassPathResource cpr1 = new ClassPathResource("spark/rrmdsi/file1.txt");
    ClassPathResource cpr2 = new ClassPathResource("spark/rrmdsi/file2.txt");

    RecordReader rr1 = new CSVRecordReader();
    rr1.initialize(new FileSplit(cpr1.getFile()));
    RecordReader rr2 = new CSVRecordReader();
    rr2.initialize(new FileSplit(cpr2.getFile()));

    RecordReaderMultiDataSetIterator rrmdsi1 = new RecordReaderMultiDataSetIterator.Builder(1)
            .addReader("r1", rr1)
            .addReader("r2", rr2)
            .addInput("r1", 1, 2)
            .addOutput("r2",1,2)
            .build();

    RecordReaderMultiDataSetIterator rrmdsi2 = new RecordReaderMultiDataSetIterator.Builder(1)
            .addReader("r1", new SparkSourceDummyReader(0))
            .addReader("r2", new SparkSourceDummyReader(1))
            .addInput("r1", 1, 2)
            .addOutput("r2",1,2)
            .build();

    List<MultiDataSet> expected = new ArrayList<>(3);
    while(rrmdsi1.hasNext()){
        expected.add(rrmdsi1.next());
    }

    JavaRDD<List<Writable>> rdd1 = sc.textFile(cpr1.getFile().getPath()).coalesce(1)
            .map(new StringToWritablesFunction(new CSVRecordReader()));
    JavaRDD<List<Writable>> rdd2 = sc.textFile(cpr2.getFile().getPath()).coalesce(1)
            .map(new StringToWritablesFunction(new CSVRecordReader()));

    List<JavaRDD<List<Writable>>> list = Arrays.asList(rdd1, rdd2);
    JavaRDD<MultiDataSet> mdsRdd = IteratorUtils.mapRRMDSI(list, null, new int[]{0,0}, null, false, rrmdsi2);

    List<MultiDataSet> act = mdsRdd.collect();


    expected = new ArrayList<>(expected);
    act = new ArrayList<>(act);
    Comparator<MultiDataSet> comp = new Comparator<MultiDataSet>() {
        @Override
        public int compare(MultiDataSet d1, MultiDataSet d2) {
            return Double.compare(d1.getFeatures(0).getDouble(0), d2.getFeatures(0).getDouble(0));
        }
    };

    Collections.sort(expected, comp);
    Collections.sort(act, comp);

    assertEquals(expected, act);
}
 
Example 19
Source File: TestSequenceRecordReaderBytesFunction.java    From DataVec with Apache License 2.0
@Test
public void testRecordReaderBytesFunction() throws Exception {

    //Local file path
    ClassPathResource cpr = new ClassPathResource("/video/shapes_0.mp4");
    String path = cpr.getFile().getAbsolutePath();
    String folder = path.substring(0, path.length() - 12);
    path = folder + "*";

    //Load binary data from local file system, convert to a sequence file:
    //Load and convert
    JavaPairRDD<String, PortableDataStream> origData = sc.binaryFiles(path);
    JavaPairRDD<Text, BytesWritable> filesAsBytes = origData.mapToPair(new FilesAsBytesFunction());
    //Write the sequence file:
    Path p = Files.createTempDirectory("dl4j_rrbytesTest");
    p.toFile().deleteOnExit();
    String outPath = p.toString() + "/out";
    filesAsBytes.saveAsNewAPIHadoopFile(outPath, Text.class, BytesWritable.class, SequenceFileOutputFormat.class);

    //Load data from sequence file, parse via SequenceRecordReader:
    JavaPairRDD<Text, BytesWritable> fromSeqFile = sc.sequenceFile(outPath, Text.class, BytesWritable.class);
    SequenceRecordReader seqRR = new CodecRecordReader();
    Configuration conf = new Configuration();
    conf.set(CodecRecordReader.RAVEL, "true");
    conf.set(CodecRecordReader.START_FRAME, "0");
    conf.set(CodecRecordReader.TOTAL_FRAMES, "25");
    conf.set(CodecRecordReader.ROWS, "64");
    conf.set(CodecRecordReader.COLUMNS, "64");
    Configuration confCopy = new Configuration(conf);
    seqRR.setConf(conf);
    JavaRDD<List<List<Writable>>> dataVecData = fromSeqFile.map(new SequenceRecordReaderBytesFunction(seqRR));



    //Next: do the same thing locally, and compare the results
    InputSplit is = new FileSplit(new File(folder), new String[] {"mp4"}, true);
    SequenceRecordReader srr = new CodecRecordReader();
    srr.initialize(is);
    srr.setConf(confCopy);

    List<List<List<Writable>>> list = new ArrayList<>(4);
    while (srr.hasNext()) {
        list.add(srr.sequenceRecord());
    }
    assertEquals(4, list.size());

    List<List<List<Writable>>> fromSequenceFile = dataVecData.collect();

    assertEquals(4, list.size());
    assertEquals(4, fromSequenceFile.size());

    boolean[] found = new boolean[4];
    for (int i = 0; i < 4; i++) {
        int foundIndex = -1;
        List<List<Writable>> collection = fromSequenceFile.get(i);
        for (int j = 0; j < 4; j++) {
            if (collection.equals(list.get(j))) {
                if (foundIndex != -1)
                    fail(); //Already found this value -> suggests this spark value equals two or more of local version? (Shouldn't happen)
                foundIndex = j;
                if (found[foundIndex])
                    fail(); //One of the other spark values was equal to this one -> suggests duplicates in Spark list
                found[foundIndex] = true; //mark this one as seen before
            }
        }
    }
    int count = 0;
    for (boolean b : found)
        if (b)
            count++;
    assertEquals(4, count); //Expect all 4 and exactly 4 pairwise matches between spark and local versions
}
 
Example 20
Source File: TestAsyncCompaction.java    From hudi with Apache License 2.0
private List<HoodieRecord> runNextDeltaCommits(HoodieWriteClient client, final HoodieReadClient readClient, List<String> deltaInstants,
                                               List<HoodieRecord> records, HoodieWriteConfig cfg, boolean insertFirst, List<String> expPendingCompactionInstants)
    throws Exception {

  HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, cfg.getBasePath());
  List<Pair<String, HoodieCompactionPlan>> pendingCompactions = readClient.getPendingCompactions();
  List<String> gotPendingCompactionInstants =
      pendingCompactions.stream().map(pc -> pc.getKey()).sorted().collect(Collectors.toList());
  assertEquals(expPendingCompactionInstants, gotPendingCompactionInstants);

  Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> fgIdToCompactionOperation =
      CompactionUtils.getAllPendingCompactionOperations(metaClient);

  if (insertFirst) {
    // Use first instant for inserting records
    String firstInstant = deltaInstants.get(0);
    deltaInstants = deltaInstants.subList(1, deltaInstants.size());
    JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
    client.startCommitWithTime(firstInstant);
    JavaRDD<WriteStatus> statuses = client.upsert(writeRecords, firstInstant);
    List<WriteStatus> statusList = statuses.collect();

    if (!cfg.shouldAutoCommit()) {
      client.commit(firstInstant, statuses);
    }
    assertNoWriteErrors(statusList);
    metaClient = new HoodieTableMetaClient(hadoopConf, cfg.getBasePath());
    HoodieTable hoodieTable = getHoodieTable(metaClient, cfg);
    List<HoodieBaseFile> dataFilesToRead = getCurrentLatestDataFiles(hoodieTable, cfg);
    assertTrue(dataFilesToRead.stream().findAny().isPresent(),
        "should list the parquet files we wrote in the delta commit");
    validateDeltaCommit(firstInstant, fgIdToCompactionOperation, cfg);
  }

  int numRecords = records.size();
  for (String instantTime : deltaInstants) {
    records = dataGen.generateUpdates(instantTime, numRecords);
    metaClient = new HoodieTableMetaClient(hadoopConf, cfg.getBasePath());
    createNextDeltaCommit(instantTime, records, client, metaClient, cfg, false);
    validateDeltaCommit(instantTime, fgIdToCompactionOperation, cfg);
  }
  return records;
}