Java Code Examples for org.apache.spark.api.java.JavaRDD#map()

The following examples show how to use org.apache.spark.api.java.JavaRDD#map(). Each example is taken from an open-source project; the originating project, source file, and license are noted above it.
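Before the project-specific examples, here is a minimal, self-contained sketch of what JavaRDD#map() does: it applies a function to every element of an RDD and returns a new RDD of the results. The class name, input strings, and local master setting below are illustrative and not taken from any of the listed projects.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class JavaRddMapSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("JavaRddMapSketch").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // map() transforms each element; here each CSV line is mapped to its field count.
        JavaRDD<String> lines = sc.parallelize(Arrays.asList("1,2,3", "4,5,6"));
        JavaRDD<Integer> fieldCounts = lines.map(line -> line.split(",").length);

        System.out.println(fieldCounts.collect()); // [3, 3]
        sc.stop();
    }
}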
Example 1
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testGetTuple1DML() {
	System.out.println("MLContextTest - Get Tuple1<Matrix> DML");
	JavaRDD<String> javaRddString = sc
			.parallelize(Stream.of("1,2,3", "4,5,6", "7,8,9").collect(Collectors.toList()));
	JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToDoubleArrayRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField("C1", DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C2", DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C3", DataTypes.DoubleType, true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> df = spark.createDataFrame(javaRddRow, schema);

	Script script = dml("N=M*2").in("M", df).out("N");
	Tuple1<Matrix> tuple = ml.execute(script).getTuple("N");
	double[][] n = tuple._1().to2DDoubleArray();
	Assert.assertEquals(2.0, n[0][0], 0);
	Assert.assertEquals(4.0, n[0][1], 0);
	Assert.assertEquals(6.0, n[0][2], 0);
	Assert.assertEquals(8.0, n[1][0], 0);
	Assert.assertEquals(10.0, n[1][1], 0);
	Assert.assertEquals(12.0, n[1][2], 0);
	Assert.assertEquals(14.0, n[2][0], 0);
	Assert.assertEquals(16.0, n[2][1], 0);
	Assert.assertEquals(18.0, n[2][2], 0);
}
 
Example 2
Source File: SparkSharder.java    From gatk with BSD 3-Clause "New" or "Revised" License
private static <L extends Locatable, SB extends ShardBoundary> JavaRDD<Shard<L>> shard(JavaSparkContext ctx, JavaRDD<L> locatables, Class<L> locatableClass,
                                                            SAMSequenceDictionary sequenceDictionary, JavaRDD<SB> intervals,
                                                            int maxLocatableLength, boolean useShuffle) {

    JavaRDD<ShardBoundary> paddedIntervals = intervals.map(ShardBoundary::paddedShardBoundary);
    if (useShuffle) {
        throw new UnsupportedOperationException("Shuffle not supported when sharding an RDD of intervals.");
    }
    return joinOverlapping(ctx, locatables, locatableClass, sequenceDictionary, paddedIntervals, maxLocatableLength,
            new MapFunction<Tuple2<ShardBoundary, Iterable<L>>, Shard<L>>() {
                private static final long serialVersionUID = 1L;
                @Override
                public Shard<L> call(Tuple2<ShardBoundary, Iterable<L>> value) {
                    return value._1().createShard(value._2());
                }
            });
}
 
Example 3
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testDataFrameSumDMLVectorWithNoIDColumnNoFormatSpecified() {
	System.out.println("MLContextTest - DataFrame sum DML, vector with no ID column, no format specified");

	List<Vector> list = new ArrayList<>();
	list.add(Vectors.dense(1.0, 2.0, 3.0));
	list.add(Vectors.dense(4.0, 5.0, 6.0));
	list.add(Vectors.dense(7.0, 8.0, 9.0));
	JavaRDD<Vector> javaRddVector = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddVector.map(new VectorRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
 
Example 4
Source File: MarkDuplicatesSparkUtilsUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Test
public void testReadsMissingReadGroups() {
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();

    SAMRecordSetBuilder samRecordSetBuilder = new SAMRecordSetBuilder(true, SAMFileHeader.SortOrder.queryname,
            true, SAMRecordSetBuilder.DEFAULT_CHROMOSOME_LENGTH, SAMRecordSetBuilder.DEFAULT_DUPLICATE_SCORING_STRATEGY);
    samRecordSetBuilder.addFrag("READ" , 0, 10000, false);

    JavaRDD<GATKRead> reads = ctx.parallelize(Lists.newArrayList(samRecordSetBuilder.getRecords()), 2).map(SAMRecordToGATKReadAdapter::new);
    reads = reads.map(r -> {r.setReadGroup(null); return r;});
    SAMFileHeader header = samRecordSetBuilder.getHeader();

    try {
        MarkDuplicatesSparkUtils.transformToDuplicateNames(header, MarkDuplicatesScoringStrategy.SUM_OF_BASE_QUALITIES, null, reads, 2, false).collect();
        Assert.fail("Should have thrown an exception");
    } catch (Exception e){
        Assert.assertTrue(e instanceof SparkException);
        Assert.assertTrue(e.getCause() instanceof UserException.ReadMissingReadGroup);
    }
}
 
Example 5
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testInputMatrixBlockDML() {
	System.out.println("MLContextTest - input MatrixBlock DML");

	List<String> list = new ArrayList<>();
	list.add("10,20,30");
	list.add("40,50,60");
	list.add("70,80,90");
	JavaRDD<String> javaRddString = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField("C1", DataTypes.StringType, true));
	fields.add(DataTypes.createStructField("C2", DataTypes.StringType, true));
	fields.add(DataTypes.createStructField("C3", DataTypes.StringType, true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	Matrix m = new Matrix(dataFrame);
	MatrixBlock matrixBlock = m.toMatrixBlock();
	Script script = dml("avg = avg(M);").in("M", matrixBlock).out("avg");
	double avg = ml.execute(script).getDouble("avg");
	Assert.assertEquals(50.0, avg, 0.0);
}
 
Example 6
Source File: InMemoryHashIndex.java    From hudi with Apache License 2.0
@Override
public JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD, JavaSparkContext jsc,
    HoodieTable<T> hoodieTable) {
  return writeStatusRDD.map(new Function<WriteStatus, WriteStatus>() {
    @Override
    public WriteStatus call(WriteStatus writeStatus) {
      for (HoodieRecord record : writeStatus.getWrittenRecords()) {
        if (!writeStatus.isErrored(record.getKey())) {
          HoodieKey key = record.getKey();
          Option<HoodieRecordLocation> newLocation = record.getNewLocation();
          if (newLocation.isPresent()) {
            recordLocationMap.put(key, newLocation.get());
          } else {
            // Delete existing index for a deleted record
            recordLocationMap.remove(key);
          }
        }
      }
      return writeStatus;
    }
  });
}
 
Example 7
Source File: TestLineRecordReaderFunction.java    From deeplearning4j with Apache License 2.0
@Test
public void testLineRecordReader() throws Exception {

    File dataFile = new ClassPathResource("iris.dat").getFile();
    List<String> lines = FileUtils.readLines(dataFile);

    JavaSparkContext sc = getContext();
    JavaRDD<String> linesRdd = sc.parallelize(lines);

    CSVRecordReader rr = new CSVRecordReader(0, ',');

    JavaRDD<List<Writable>> out = linesRdd.map(new LineRecordReaderFunction(rr));
    List<List<Writable>> outList = out.collect();


    CSVRecordReader rr2 = new CSVRecordReader(0, ',');
    rr2.initialize(new FileSplit(dataFile));
    Set<List<Writable>> expectedSet = new HashSet<>();
    int totalCount = 0;
    while (rr2.hasNext()) {
        expectedSet.add(rr2.next());
        totalCount++;
    }

    assertEquals(totalCount, outList.size());

    for (List<Writable> line : outList) {
        assertTrue(expectedSet.contains(line));
    }
}
 
Example 8
Source File: EntitySalienceFeatureExtractorSpark.java    From ambiverse-nlu with Apache License 2.0
/**
 * Extract a DataFrame ready for training or testing.
 * @param jsc the Spark context used to register the accumulators
 * @param documents JavaRDD of SCAS documents to extract training instances from
 * @param sqlContext the SQL context used to create the resulting DataFrame
 * @return a DataFrame with docId, entityId, label, and features columns
 * @throws ResourceInitializationException if a required resource cannot be initialized
 */
public DataFrame extract(JavaSparkContext jsc, JavaRDD<SCAS> documents, SQLContext sqlContext) throws ResourceInitializationException {
    Accumulator<Integer> TOTAL_DOCS = jsc.accumulator(0, "TOTAL_DOCS");
    Accumulator<Integer> SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "SALIENT_ENTITY_INSTANCES");
    Accumulator<Integer> NON_SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "NON_SALIENT_ENTITY_INSTANCES");

    TrainingSettings trainingSettings = getTrainingSettings();

    FeatureExtractor fe = new NYTEntitySalienceFeatureExtractor();
    final int featureVectorSize = FeatureSetFactory.createFeatureSet(TrainingSettings.FeatureExtractor.ENTITY_SALIENCE).getFeatureVectorSize();

    JavaRDD<TrainingInstance> trainingInstances =
            documents.flatMap(s -> {
                TOTAL_DOCS.add(1);
                return fe.getTrainingInstances(s.getJCas(),
                        trainingSettings.getFeatureExtractor(),
                        trainingSettings.getPositiveInstanceScalingFactor());
            });

    StructType schema = new StructType(new StructField[]{
            new StructField("docId", DataTypes.StringType, false, Metadata.empty() ),
            new StructField("entityId", DataTypes.StringType, false, Metadata.empty() ),
            new StructField("label", DataTypes.DoubleType, false, Metadata.empty() ),
            new StructField("features", new VectorUDT(), false, Metadata.empty())
    });

    JavaRDD<Row> withFeatures = trainingInstances.map(ti -> {
        if (ti.getLabel() == 1.0) {
            SALIENT_ENTITY_INSTANCES.add(1);
        } else {
            NON_SALIENT_ENTITY_INSTANCES.add(1);
        }
        Vector vei = FeatureValueInstanceUtils.convertToSparkMLVector(ti, featureVectorSize);
        return RowFactory.create(ti.getDocId(), ti.getEntityId(), ti.getLabel(), vei);
    });

    return sqlContext.createDataFrame(withFeatures, schema);
}
 
Example 9
Source File: SparkExport.java    From DataVec with Apache License 2.0
public static void exportCSVSpark(String directory, String delimiter, String quote, int outputSplits,
                JavaRDD<List<Writable>> data) {

    //NOTE: Order is probably not random here...
    JavaRDD<String> lines = data.map(new WritablesToStringFunction(delimiter, quote));
    // coalesce is a transformation that returns a new RDD; reassign so outputSplits takes effect
    lines = lines.coalesce(outputSplits);

    lines.saveAsTextFile(directory);
}
 
Example 10
Source File: DataFrameOps.java    From toolbox with Apache License 2.0
static JavaRDD<DataInstance> toDataInstanceRDD(DataFrame data, Attributes attributes) {

    JavaRDD<double[]> rawRDD = data.rdd()
                                   .toJavaRDD()
                                   .map(row -> transformRow2DataInstance(row, attributes));

    return rawRDD.map(v -> new DataInstanceFromDataRow(new DataRowSpark(v, attributes)));
}
 
Example 11
Source File: MLLibUtil.java    From deeplearning4j with Apache License 2.0
/**
 * Converts a JavaRDD of LabeledPoints to a JavaRDD of DataSets.
 * @param data JavaRDD of LabeledPoints
 * @param numPossibleLabels number of possible labels
 * @param preCache whether to cache the RDD before the conversion
 * @return JavaRDD of DataSets
 */
public static JavaRDD<DataSet> fromLabeledPoint(JavaRDD<LabeledPoint> data, final long numPossibleLabels,
                boolean preCache) {
    if (preCache && !data.getStorageLevel().useMemory()) {
        data.cache();
    }
    return data.map(new Function<LabeledPoint, DataSet>() {
        @Override
        public DataSet call(LabeledPoint lp) {
            return fromLabeledPoint(lp, numPossibleLabels);
        }
    });
}
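
A hypothetical call site for the helper above might look like the following sketch; the toDataSets wrapper, the two LabeledPoints, and the label count of 2 are assumptions for illustration, not code from the deeplearning4j project.

import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.nd4j.linalg.dataset.DataSet;

static JavaRDD<DataSet> toDataSets(JavaSparkContext sc) {
    // Hypothetical input: two labeled points with 2-dimensional features.
    JavaRDD<LabeledPoint> points = sc.parallelize(Arrays.asList(
            new LabeledPoint(0.0, Vectors.dense(1.0, 2.0)),
            new LabeledPoint(1.0, Vectors.dense(3.0, 4.0))));

    // Convert to DL4J DataSets with 2 possible labels, caching the input RDD first.
    return MLLibUtil.fromLabeledPoint(points, 2, true);
}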
 
Example 12
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testDataFrameToBinaryBlocks() {
	System.out.println("MLContextTest - DataFrame to binary blocks");

	List<String> list = new ArrayList<>();
	list.add("1,2,3");
	list.add("4,5,6");
	list.add("7,8,9");
	JavaRDD<String> javaRddString = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToDoubleArrayRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField("C1", DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C2", DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C3", DataTypes.DoubleType, true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlocks = MLContextConversionUtil
			.dataFrameToMatrixBinaryBlocks(dataFrame);
	Tuple2<MatrixIndexes, MatrixBlock> first = binaryBlocks.first();
	MatrixBlock mb = first._2();
	double[][] matrix = DataConverter.convertToDoubleMatrix(mb);
	Assert.assertArrayEquals(new double[] { 1.0, 2.0, 3.0 }, matrix[0], 0.0);
	Assert.assertArrayEquals(new double[] { 4.0, 5.0, 6.0 }, matrix[1], 0.0);
	Assert.assertArrayEquals(new double[] { 7.0, 8.0, 9.0 }, matrix[2], 0.0);
}
 
Example 13
Source File: BatchHeatMapProcessor.java    From lambda-arch with Apache License 2.0
/**
 * Converts each IoTData record in the RDD to a Measurement.
 *
 * @param iotData JavaRDD of IoTData records parsed from the CSV file
 * @return A JavaRDD containing all records from the CSV file as Measurements
 */
private JavaRDD<Measurement> csvToMeasurements(JavaRDD<IoTData> iotData) {
    JavaRDD<Measurement> map = iotData.map(row -> {
        Coordinate coordinate = new Coordinate(
                Double.valueOf(row.getLatitude()),
                Double.valueOf(row.getLongitude())
        );
        return new Measurement(coordinate, row.getTimestamp());
    });
    return map;
}
 
Example 14
Source File: PileupSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Override
protected void processAlignments(JavaRDD<LocusWalkerContext> rdd, JavaSparkContext ctx) {
    JavaRDD<String> lines = rdd.map(pileupFunction(metadata, outputInsertLength, showVerbose));
    if (numReducers != 0) {
        lines = lines.coalesce(numReducers);
    }
    lines.saveAsTextFile(outputFile);
}
 
Example 15
Source File: AdaBoostMHLearner.java    From sparkboost with Apache License 2.0
protected void updateDistributionMatrix(JavaSparkContext sc, JavaRDD<MultilabelPoint> docs, double[][] localDM, WeakHypothesis localWH) {
    Broadcast<WeakHypothesis> distWH = sc.broadcast(localWH);
    Broadcast<double[][]> distDM = sc.broadcast(localDM);
    JavaRDD<DMPartialResult> partialResults = docs.map(doc -> {
        int[] validFeatures = doc.getFeatures().indices();
        HashMap<Integer, Integer> dictFeatures = new HashMap<>();
        for (int featID : validFeatures)
            dictFeatures.put(featID, featID);
        HashMap<Integer, Integer> dictLabels = new HashMap<>();
        for (int idx = 0; idx < doc.getLabels().length; idx++)
            dictLabels.put(doc.getLabels()[idx], doc.getLabels()[idx]);

        double[][] dm = distDM.getValue();
        WeakHypothesis wh = distWH.getValue();
        double[] labelsRes = new double[dm.length];
        for (int labelID = 0; labelID < dm.length; labelID++) {
            float catValue = 1;
            if (dictLabels.containsKey(labelID)) {
                catValue = -1;
            }

            // Compute the weak hypothesis value.
            double value = 0;
            WeakHypothesis.WeakHypothesisData v = wh.getLabelData(labelID);
            int pivot = v.getFeatureID();
            if (dictFeatures.containsKey(pivot))
                value = v.getC1();
            else
                value = v.getC0();


            double partialRes = dm[labelID][doc.getPointID()] * Math.exp(catValue * value);
            labelsRes[labelID] = partialRes;
        }

        return new DMPartialResult(doc.getPointID(), labelsRes);
    });

    Iterator<DMPartialResult> itResults = partialResults.toLocalIterator();
    // Update partial results.
    double normalization = 0;
    while (itResults.hasNext()) {
        DMPartialResult r = itResults.next();
        for (int labelID = 0; labelID < localDM.length; labelID++) {
            localDM[labelID][r.docID] = r.labelsRes[labelID];
            normalization += localDM[labelID][r.docID];
        }
    }

    // Normalize all values.
    for (int labelID = 0; labelID < localDM.length; labelID++) {
        for (int docID = 0; docID < localDM[0].length; docID++) {
            localDM[labelID][docID] = localDM[labelID][docID] / normalization;
        }
    }
}
 
Example 16
Source File: EntitySalienceAnnotatorAndFeatureExtractorSpark.java    From ambiverse-nlu with Apache License 2.0
/**
 * Extract a DataFrame ready for training or testing.
 * @param jsc the Spark context used to register the accumulators
 * @param documents JavaRDD of SCAS documents to annotate and extract training instances from
 * @param sqlContext the SQL context used to create the resulting DataFrame
 * @return a DataFrame with docId, entity, label, and features columns
 * @throws ResourceInitializationException if a required resource cannot be initialized
 */
public DataFrame extract(JavaSparkContext jsc, JavaRDD<SCAS> documents, SQLContext sqlContext) throws ResourceInitializationException {
    Accumulator<Integer> TOTAL_DOCS = jsc.accumulator(0, "TOTAL_DOCS");
    Accumulator<Integer> SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "SALIENT_ENTITY_INSTANCES");
    Accumulator<Integer> NON_SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "NON_SALIENT_ENTITY_INSTANCES");

    TrainingSettings trainingSettings = getTrainingSettings();

    final SparkSerializableAnalysisEngine ae = EntitySalienceFactory.createEntitySalienceEntityAnnotator(trainingSettings.getEntitySalienceEntityAnnotator());
    FeatureExtractor fe = new NYTEntitySalienceFeatureExtractor();
    final int featureVectorSize = FeatureSetFactory.createFeatureSet(TrainingSettings.FeatureExtractor.ENTITY_SALIENCE).getFeatureVectorSize();

    JavaRDD<TrainingInstance> trainingInstances =
            documents
                    .map(s -> {
                        TOTAL_DOCS.add(1);
                        Logger tmpLogger = LoggerFactory.getLogger(EntitySalienceFeatureExtractorSpark.class);
                        String docId = JCasUtil.selectSingle(s.getJCas(), DocumentMetaData.class).getDocumentId();
                        tmpLogger.info("Processing document {}.", docId);
                        //Before processing the document through the Disambiguation Pipeline, add the AIDA settings
                        // in each document.
                        SparkUimaUtils.addSettingsToJCas(s.getJCas(),
                                trainingSettings.getDocumentCoherent(),
                                trainingSettings.getDocumentConfidenceThreshold());
                        return ae.process(s);
                    })
                    .flatMap(s -> fe.getTrainingInstances(s.getJCas(),
                            trainingSettings.getFeatureExtractor(),
                            trainingSettings.getPositiveInstanceScalingFactor()));

    StructType schema = new StructType(new StructField[]{
            new StructField("docId", DataTypes.StringType, false, Metadata.empty() ),
            new StructField("entity", DataTypes.StringType, false, Metadata.empty() ),
            new StructField("label", DataTypes.DoubleType, false, Metadata.empty() ),
            new StructField("features", new VectorUDT(), false, Metadata.empty())
    });

    JavaRDD<Row> withFeatures = trainingInstances.map(ti -> {
        if (ti.getLabel() == 1.0) {
            SALIENT_ENTITY_INSTANCES.add(1);
        } else {
            NON_SALIENT_ENTITY_INSTANCES.add(1);
        }
        Vector vei = FeatureValueInstanceUtils.convertToSparkMLVector(ti, featureVectorSize);
        return RowFactory.create(ti.getDocId(), ti.getEntityId(), ti.getLabel(), vei);
    });

    return sqlContext.createDataFrame(withFeatures, schema);
}
 
Example 17
Source File: JavaLatentDirichletAllocationExample.java    From SparkDemo with MIT License
public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaKLatentDirichletAllocationExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    // Load and parse the data
    String path = "data/mllib/sample_lda_data.txt";
    JavaRDD<String> data = jsc.textFile(path);
    JavaRDD<Vector> parsedData = data.map(
      new Function<String, Vector>() {
        public Vector call(String s) {
          String[] sarray = s.trim().split(" ");
          double[] values = new double[sarray.length];
          for (int i = 0; i < sarray.length; i++) {
            values[i] = Double.parseDouble(sarray[i]);
          }
          return Vectors.dense(values);
        }
      }
    );
    // Index documents with unique IDs
    JavaPairRDD<Long, Vector> corpus =
      JavaPairRDD.fromJavaRDD(parsedData.zipWithIndex().map(
        new Function<Tuple2<Vector, Long>, Tuple2<Long, Vector>>() {
          public Tuple2<Long, Vector> call(Tuple2<Vector, Long> doc_id) {
            return doc_id.swap();
          }
        }
      )
    );
    corpus.cache();

    // Cluster the documents into three topics using LDA
    LDAModel ldaModel = new LDA().setK(3).run(corpus);

    // Output topics. Each is a distribution over words (matching word count vectors)
    System.out.println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize()
      + " words):");
    Matrix topics = ldaModel.topicsMatrix();
    for (int topic = 0; topic < 3; topic++) {
      System.out.print("Topic " + topic + ":");
      for (int word = 0; word < ldaModel.vocabSize(); word++) {
        System.out.print(" " + topics.apply(word, topic));
      }
      System.out.println();
    }

    ldaModel.save(jsc.sc(),
      "target/org/apache/spark/JavaLatentDirichletAllocationExample/LDAModel");
    DistributedLDAModel sameModel = DistributedLDAModel.load(jsc.sc(),
      "target/org/apache/spark/JavaLatentDirichletAllocationExample/LDAModel");
    // $example off$

    jsc.stop();
}
 
Example 18
Source File: HierarchicalAlignmentClient.java    From render with GNU General Public License v2.0
private void createWarpStackForTier()
        throws IOException {

    LOG.info("createWarpStackForTier: entry");

    final ProcessTimer timer = new ProcessTimer();

    final Set<StackId> existingRoughProjectStackIds = new HashSet<>(driverRoughRender.getProjectStacks());

    final StackId warpStackId = HierarchicalStack.deriveWarpStackIdForTier(roughTilesStackId, currentTier);

    boolean generateWarpStack = true;
    if (existingRoughProjectStackIds.contains(warpStackId) &&
        parameters.keepExisting(PipelineStep.WARP)) {
        generateWarpStack = false;
    }

    if (generateWarpStack) {

        // remove any existing warp stack results
        driverRoughRender.deleteStack(warpStackId.getStack(), null);

        final StackMetaData roughTilesStackMetaData =
                driverRoughRender.getStackMetaData(roughTilesStackId.getStack());

        driverRoughRender.setupDerivedStack(roughTilesStackMetaData, warpStackId.getStack());

        final String projectForTier = this.tierProject;

        final JavaRDD<Double> rddZValues = sparkContext.parallelize(zValues);
        final HierarchicalWarpFieldStackFunction warpFieldStackFunction
                = new HierarchicalWarpFieldStackFunction(parameters.renderWeb.baseDataUrl,
                                                         parameters.renderWeb.owner,
                                                         currentTier,
                                                         projectForTier,
                                                         tierParentStackId,
                                                         warpStackId.getStack(),
                                                         parameters.consensusBuildMethod);

        final JavaRDD<Integer> rddTileCounts = rddZValues.map(warpFieldStackFunction);

        final List<Integer> tileCountList = rddTileCounts.collect();

        LOG.info("createWarpStackForTier: counting results");

        long total = 0;
        for (final Integer tileCount : tileCountList) {
            total += tileCount;
        }

        LOG.info("createWarpStackForTier: added {} tile specs to {}", total, warpStackId);

        driverRoughRender.setStackState(warpStackId.getStack(), StackMetaData.StackState.COMPLETE);
    }

    LOG.info("createWarpStackForTier: exit, processing took {} seconds", timer.getElapsedSeconds());
}
 
Example 19
Source File: DataFrames.java    From DataVec with Apache License 2.0
/**
 * Creates a DataFrame from an RDD of writable records, given a schema.
 *
 * @param schema the schema to use
 * @param data   the data to convert
 * @return the dataframe object
 */
public static DataRowsFacade toDataFrame(Schema schema, JavaRDD<List<Writable>> data) {
    JavaSparkContext sc = new JavaSparkContext(data.context());
    SQLContext sqlContext = new SQLContext(sc);
    JavaRDD<Row> rows = data.map(new ToRow(schema));
    return dataRows(sqlContext.createDataFrame(rows, fromSchema(schema)));
}
 
Example 20
Source File: DataFrameOps.java    From toolbox with Apache License 2.0
static JavaRDD<Row> toRowRDD(JavaRDD<DataInstance> rawRDD, Attributes atts) {

    // FIXME: Categorical values should be inserted with their corresponding state name
    return rawRDD.map(v -> transformArray2RowAttributes(v, atts));
}