org.apache.spark.rdd.RDD Java Examples

The following examples show how to use org.apache.spark.rdd.RDD in Java. Each example is an excerpt from an open-source project; the source file and license are listed above the code.
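Most of the examples move back and forth between the Scala-facing RDD type and its Java-friendly JavaRDD wrapper. As a quick orientation, here is a minimal, self-contained sketch of that round trip; it is not taken from any of the projects below, and the local master URL, class name, and application name are illustrative only.

import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.rdd.RDD;

public class RDDConversionSketch {
  public static void main(String[] args) {
    // Local context for illustration only.
    JavaSparkContext jsc = new JavaSparkContext("local[*]", "rdd-conversion-sketch");

    // Build a JavaRDD from an in-memory list.
    JavaRDD<String> javaRDD = jsc.parallelize(Arrays.asList("a", "b", "c"));

    // Convert to the Scala-facing RDD<String>, as several examples below do.
    RDD<String> rdd = JavaRDD.toRDD(javaRDD);

    // Convert back when the Java-friendly API is more convenient.
    JavaRDD<String> roundTripped = rdd.toJavaRDD();
    System.out.println("count = " + roundTripped.count());

    jsc.stop();
  }
}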
Example #1
Source File: MLMetricsSupporter.java    From DDF with Apache License 2.0
@Override
public DDF residuals() throws DDFException {
  SparkDDF predictionDDF = (SparkDDF) this.getDDF();
  JavaRDD<double[]> predictionRDD = predictionDDF.getJavaRDD(double[].class);

  JavaRDD<double[]> result = predictionRDD.map(new MetricsMapperResiduals());

  if (result == null) mLog.error(">> javaRDD result of MetricMapper residuals is null");
  if (predictionDDF.getManager() == null) mLog.error(">> predictionDDF.getManager() is null");
  if (result.rdd() == null) mLog.error(">> result.rdd() is null");
  if (predictionDDF.getSchema() == null) mLog.error(">> predictionDDF.getSchema() is null");
  if (predictionDDF.getName() == null) mLog.error(">> predictionDDF.getName() is null");

  Schema schema = new Schema("residuals double");
  DDFManager manager = this.getDDF().getManager();
  DDF residualDDF = manager
      .newDDF(manager, result.rdd(), new Class<?>[] { RDD.class, double[].class }, null,
          schema);

  if (residualDDF == null) mLog.error(">> residualDDF is null");

  return residualDDF;
}
 
Example #2
Source File: CollectedGroupConverter.java    From spork with Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors,
    POCollectedGroup physicalOperator) throws IOException {
  SparkUtil.assertPredecessorSize(predecessors, physicalOperator, 1);
  RDD<Tuple> rdd = predecessors.get(0);
  // return predecessors.get(0);
  RDD<Tuple> rdd2 = rdd.coalesce(1, false, null);
  long count = 0;
  try {
    count = rdd2.count();
  } catch (Exception e) {
    // Counting failures are swallowed; count falls back to 0.
  }
  CollectedGroupFunction collectedGroupFunction
      = new CollectedGroupFunction(physicalOperator, count);
  return rdd.toJavaRDD().mapPartitions(collectedGroupFunction, true).rdd();
}
 
Example #3
Source File: Evaluation.java    From oryx with Apache License 2.0
/**
 * Computes root mean squared error of {@link Rating#rating()} versus predicted value.
 */
static double rmse(MatrixFactorizationModel mfModel, JavaRDD<Rating> testData) {
  JavaPairRDD<Tuple2<Integer,Integer>,Double> testUserProductValues =
      testData.mapToPair(rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating()));
  @SuppressWarnings("unchecked")
  RDD<Tuple2<Object,Object>> testUserProducts =
      (RDD<Tuple2<Object,Object>>) (RDD<?>) testUserProductValues.keys().rdd();
  JavaRDD<Rating> predictions = testData.wrapRDD(mfModel.predict(testUserProducts));
  double mse = predictions.mapToPair(
      rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating())
  ).join(testUserProductValues).values().mapToDouble(valuePrediction -> {
    double diff = valuePrediction._1() - valuePrediction._2();
    return diff * diff;
  }).mean();
  return Math.sqrt(mse);
}
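As a usage note, the following sketch shows one way a helper like rmse() could be exercised against MLlib's ALS. It assumes a JavaSparkContext named jsc and imports from org.apache.spark.mllib.recommendation; the split ratio and hyper-parameters are placeholder values, not anything taken from the Oryx project.

JavaRDD<Rating> ratings = jsc.parallelize(Arrays.asList(
    new Rating(1, 1, 5.0), new Rating(1, 2, 3.0), new Rating(2, 1, 4.0)));
// Hold out a small test set.
JavaRDD<Rating>[] splits = ratings.randomSplit(new double[]{0.9, 0.1}, 123L);
// Train a factorization model on the training portion.
MatrixFactorizationModel mfModel =
    ALS.train(JavaRDD.toRDD(splits[0]), 10 /* rank */, 10 /* iterations */, 0.01 /* lambda */);
// Evaluate with the helper defined above.
double error = rmse(mfModel, splits[1]);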
 
Example #4
Source File: MLContextConversionUtil.java    From systemds with Apache License 2.0
/**
 * Convert a {@code MatrixObject} to a {@code RDD<String>} in IJV format.
 *
 * @param matrixObject
 *            the {@code MatrixObject}
 * @return the {@code MatrixObject} converted to a {@code RDD<String>}
 */
public static RDD<String> matrixObjectToRDDStringIJV(MatrixObject matrixObject) {

	// NOTE: The following works when called from Java but does not
	// currently work when called from Spark Shell (when you call
	// collect() on the RDD<String>).
	//
	// JavaRDD<String> javaRDD = jsc.parallelize(list);
	// RDD<String> rdd = JavaRDD.toRDD(javaRDD);
	//
	// Therefore, we call parallelize() on the SparkContext rather than
	// the JavaSparkContext to produce the RDD<String> for Scala.

	List<String> list = matrixObjectToListStringIJV(matrixObject);

	ClassTag<String> tag = scala.reflect.ClassTag$.MODULE$.apply(String.class);
	return sc().parallelize(JavaConversions.asScalaBuffer(list), sc().defaultParallelism(), tag);
}
 
Example #5
Source File: DeepSparkContext.java    From deep-spark with Apache License 2.0
/**
 * Returns a Cells RDD from HDFS.
 * @param config HDFS ExtractorConfig.
 * @return Cells RDD.
 */
public RDD<Cells> createHDFSRDD(ExtractorConfig<Cells> config) {

    Serializable host = config.getValues().get(ExtractorConstants.HOST);
    Serializable port = config.getValues().get(ExtractorConstants.PORT);
    Serializable path = config.getValues().get(ExtractorConstants.FS_FILE_PATH);

    final TextFileDataTable textFileDataTable = UtilFS.createTextFileMetaDataFromConfig(config, this);

    String filePath = path.toString();
    if (config.getExtractorImplClassName().equals(ExtractorConstants.HDFS)) {
        filePath = ExtractorConstants.HDFS_PREFIX + host.toString() + ":" + port + path.toString();
    }

    return createRDDFromFilePath(filePath, textFileDataTable);
}
 
Example #6
Source File: MLContextConversionUtil.java    From systemds with Apache License 2.0
/**
 * Convert a {@code FrameObject} to a {@code RDD<String>} in IJV format.
 *
 * @param frameObject
 *            the {@code FrameObject}
 * @return the {@code FrameObject} converted to a {@code RDD<String>}
 */
public static RDD<String> frameObjectToRDDStringIJV(FrameObject frameObject) {

	// NOTE: The following works when called from Java but does not
	// currently work when called from Spark Shell (when you call
	// collect() on the RDD<String>).
	//
	// JavaRDD<String> javaRDD = jsc.parallelize(list);
	// RDD<String> rdd = JavaRDD.toRDD(javaRDD);
	//
	// Therefore, we call parallelize() on the SparkContext rather than
	// the JavaSparkContext to produce the RDD<String> for Scala.

	List<String> list = frameObjectToListStringIJV(frameObject);

	ClassTag<String> tag = scala.reflect.ClassTag$.MODULE$.apply(String.class);
	return sc().parallelize(JavaConversions.asScalaBuffer(list), sc().defaultParallelism(), tag);
}
 
Example #7
Source File: LoadConverter.java    From spork with Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessorRdds, POLoad poLoad)
        throws IOException {
    // if (predecessors.size()!=0) {
    // throw new
    // RuntimeException("Should not have predecessors for Load. Got : "+predecessors);
    // }

    JobConf loadJobConf = SparkUtil.newJobConf(pigContext);
    configureLoader(physicalPlan, poLoad, loadJobConf);

    // don't know why but just doing this cast for now
    RDD<Tuple2<Text, Tuple>> hadoopRDD = sparkContext.newAPIHadoopFile(
            poLoad.getLFile().getFileName(), PigInputFormatSpark.class,
            Text.class, Tuple.class, loadJobConf);

    registerUdfFiles();
    // map to get just RDD<Tuple>
    return hadoopRDD.map(TO_TUPLE_FUNCTION,
            SparkUtil.getManifest(Tuple.class));
}
 
Example #8
Source File: ExtractorTest.java    From deep-spark with Apache License 2.0
/**
 * Test filter EQ.
 *
 * @param <W> the type parameter
 */
@Test(alwaysRun = true, dependsOnGroups = { "FunctionalTests" })
protected <W> void testFilterEQ() {
    DeepSparkContext context = getDeepSparkContext();
    try {
        Filter filter = new Filter("id", FilterType.EQ, "TestDataSet");
        Filter[] filters = new Filter[] { filter };
        ExtractorConfig<W> inputConfigEntity2 = getFilterConfig(filters);

        RDD<W> inputRDDEntity2 = context.createRDD(inputConfigEntity2);
        assertEquals(inputRDDEntity2.count(), 1);
    } finally {
        context.stop();
    }

}
 
Example #9
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testRDDGoodMetadataDML() {
	System.out.println("MLContextTest - RDD<String> good metadata DML");

	List<String> list = new ArrayList<>();
	list.add("1,1,1");
	list.add("2,2,2");
	list.add("3,3,3");
	JavaRDD<String> javaRDD = sc.parallelize(list);
	RDD<String> rdd = JavaRDD.toRDD(javaRDD);

	MatrixMetadata mm = new MatrixMetadata(3, 3, 9);

	Script script = dml("print('sum: ' + sum(M));").in("M", rdd, mm);
	setExpectedStdOut("sum: 18.0");
	ml.execute(script);
}
 
Example #10
Source File: RankConverter.java    From spork with Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, PORank poRank)
        throws IOException {
    SparkUtil.assertPredecessorSize(predecessors, poRank, 1);
    RDD<Tuple> rdd = predecessors.get(0);
    JavaPairRDD<Integer, Long> javaPairRdd = rdd.toJavaRDD()
            .mapToPair(new ToPairRdd());
    JavaPairRDD<Integer, Iterable<Long>> groupedByIndex = javaPairRdd
            .groupByKey();
    JavaPairRDD<Integer, Long> countsByIndex = groupedByIndex
            .mapToPair(new IndexCounters());
    JavaPairRDD<Integer, Long> sortedCountsByIndex = countsByIndex
            .sortByKey(true);
    Map<Integer, Long> counts = sortedCountsByIndex.collectAsMap();
    JavaRDD<Tuple> finalRdd = rdd.toJavaRDD()
            .map(new RankFunction(new HashMap<Integer, Long>(counts)));
    return finalRdd.rdd();
}
 
Example #11
Source File: SparkDatasetBoundedSourceVertex.java    From incubator-nemo with Apache License 2.0
/**
 * Constructor.
 *
 * @param sparkSession sparkSession to recreate on each executor.
 * @param dataset      Dataset to read data from.
 */
public SparkDatasetBoundedSourceVertex(final SparkSession sparkSession, final Dataset<T> dataset) {
  this.readables = new ArrayList<>();
  final RDD rdd = dataset.sparkRDD();
  final Partition[] partitions = rdd.getPartitions();
  for (int i = 0; i < partitions.length; i++) {
    readables.add(new SparkDatasetBoundedSourceReadable(
      partitions[i],
      sparkSession.getDatasetCommandsList(),
      sparkSession.getInitialConf(),
      i));
  }
  this.estimatedByteSize = dataset.javaRDD()
    .map(o -> (long) o.toString().getBytes("UTF-8").length)
    .reduce((a, b) -> a + b);
}
 
Example #12
Source File: AerospikeCellExtractorFT.java    From deep-spark with Apache License 2.0
@Test
@Override
public void testDataSet() {
    DeepSparkContext context = new DeepSparkContext("local", "deepSparkContextTest");

    try {

        ExtractorConfig<Cells> inputConfigEntity = new ExtractorConfig(Cells.class);
        inputConfigEntity.putValue(ExtractorConstants.HOST, AerospikeJavaRDDFT.HOST)
                .putValue(ExtractorConstants.PORT, AerospikeJavaRDDFT.PORT)
                .putValue(ExtractorConstants.NAMESPACE, AerospikeJavaRDDFT.NAMESPACE_CELL)
                .putValue(ExtractorConstants.SET, ExtractorTest.BOOK_INPUT);
        inputConfigEntity.setExtractorImplClass(AerospikeCellExtractor.class);

        RDD<Cells> inputRDDEntity = context.createRDD(inputConfigEntity);

        //Import dataSet was OK and we could read it
        assertEquals(inputRDDEntity.count(), 1, "Expected read entity count is 1");

    } finally {
        context.stop();
    }

}
 
Example #13
Source File: DistinctConverter.java    From spork with Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors,
        PODistinct poDistinct) throws IOException {
    SparkUtil.assertPredecessorSize(predecessors, poDistinct, 1);
    RDD<Tuple> rdd = predecessors.get(0);

    ClassTag<Tuple2<Tuple, Object>> tuple2ClassManifest = SparkUtil
            .<Tuple, Object> getTuple2Manifest();

    RDD<Tuple2<Tuple, Object>> rddPairs = rdd.map(TO_KEY_VALUE_FUNCTION,
            tuple2ClassManifest);
    PairRDDFunctions<Tuple, Object> pairRDDFunctions
      = new PairRDDFunctions<Tuple, Object>(
            rddPairs, SparkUtil.getManifest(Tuple.class),
            SparkUtil.getManifest(Object.class), null);
    int parallelism = SparkUtil.getParallelism(predecessors, poDistinct);
    return pairRDDFunctions.reduceByKey(MERGE_VALUES_FUNCTION, parallelism)
            .map(TO_VALUE_FUNCTION, SparkUtil.getManifest(Tuple.class));
}
 
Example #14
Source File: DeepSparkContextTest.java    From deep-spark with Apache License 2.0
@Test
public void createHDFSRDDTest() throws Exception {

    deepSparkContext = createDeepSparkContext();
    DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
    SQLContext sqlContext = mock(SQLContext.class);
    Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
    Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);

    RDD<String> rdd = mock(RDD.class);
    JavaRDD<String> javaRdd = mock(JavaRDD.class);
    when(deepSparkContextSpy.sc().textFile(anyString(), anyInt())).thenReturn(rdd);
    doReturn(javaRdd).when(deepSparkContextSpy).textFile(anyString());
    when(rdd.toJavaRDD()).thenReturn(javaRdd);
    when(rdd.toJavaRDD().map(any(Function.class))).thenReturn(singleRdd);

    ExtractorConfig<Cells> config = createHDFSDeepJobConfig();

    RDD rddReturn = deepSparkContextSpy.createHDFSRDD(config);

    verify(deepSparkContextSpy.sc(), times(1)).textFile(anyString(), anyInt());

    verify(javaRdd, times(1)).map(any(Function.class));

}
 
Example #15
Source File: JdbcEntityExtractorFT.java    From deep-spark with Apache License 2.0
@Test
@Override
public void testDataSet() {
    DeepSparkContext context = new DeepSparkContext("local", "deepSparkContextTest");

    try {

        ExtractorConfig<MessageTestEntity> inputConfigEntity = getReadExtractorConfig();
        RDD<MessageTestEntity> inputRDDEntity = context.createRDD(inputConfigEntity);

        //Import dataSet was OK and we could read it
        assertEquals(inputRDDEntity.count(), 1, "Expected read entity count is 1");

    } finally {
        context.stop();
    }

}
 
Example #16
Source File: SparkUtils.java    From spliceengine with GNU Affero General Public License v3.0
@SuppressWarnings("rawtypes")
// TODO (wjk): remove this when we have a better way to change name of RDDs implicitly created within spark
private static void setAncestorRDDNames(org.apache.spark.rdd.RDD rdd, int levels, String[] newNames, String[] checkNames) {
    assert levels > 0;
    org.apache.spark.rdd.RDD currentRDD = rdd;
    for (int i = 0; i < levels && currentRDD != null; i++) {
        org.apache.spark.rdd.RDD rddAnc =
                ((org.apache.spark.Dependency) currentRDD.dependencies().head()).rdd();
        if (rddAnc != null) {
            if (checkNames == null || checkNames[i] == null)
                rddAnc.setName(newNames[i]);
            else if (rddAnc.name().equals(checkNames[i]))
                rddAnc.setName(newNames[i]);
        }
        currentRDD = rddAnc;
    }
}
 
Example #17
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testRDDSumIJVDML() {
	System.out.println("MLContextTest - RDD<String> IJV sum DML");

	List<String> list = new ArrayList<>();
	list.add("1 1 1");
	list.add("2 1 2");
	list.add("1 2 3");
	list.add("3 3 4");
	JavaRDD<String> javaRDD = sc.parallelize(list);
	RDD<String> rdd = JavaRDD.toRDD(javaRDD);

	MatrixMetadata mm = new MatrixMetadata(MatrixFormat.IJV, 3, 3);

	Script script = dml("print('sum: ' + sum(M));").in("M", rdd, mm);
	setExpectedStdOut("sum: 10.0");
	ml.execute(script);
}
 
Example #18
Source File: DeepSparkContextTest.java    From deep-spark with Apache License 2.0
@Test
public void textFileHDFSTest() throws Exception {
    deepSparkContext = createDeepSparkContext();
    DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
    SQLContext sqlContext = mock(SQLContext.class);
    Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
    Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);
    RDD<Cells> result = mock(RDD.class);

    ExtractorConfig<Cells> config = createHDFSDeepJobConfig();
    PowerMockito.doReturn(result).when(deepSparkContextSpy).createHDFSRDD(config);
    deepSparkContextSpy.textFile(config);

    verify(deepSparkContextSpy, times(1)).createHDFSRDD(config);
}
 
Example #19
Source File: MLUpdate.java    From oryx with Apache License 2.0
/**
 * Default implementation which randomly splits new data into train/test sets.
 * This handles the case where {@link #getTestFraction()} is not 0 or 1.
 *
 * @param newData data that has arrived in the current input batch
 * @return a {@link Pair} of train, test {@link RDD}s.
 */
protected Pair<JavaRDD<M>,JavaRDD<M>> splitNewDataToTrainTest(JavaRDD<M> newData) {
  RDD<M>[] testTrainRDDs = newData.rdd().randomSplit(
      new double[]{1.0 - testFraction, testFraction},
      RandomManager.getRandom().nextLong());
  return new Pair<>(newData.wrapRDD(testTrainRDDs[0]),
                    newData.wrapRDD(testTrainRDDs[1]));
}
 
Example #20
Source File: SourceDStream.java    From beam with Apache License 2.0
@Override
public scala.Option<RDD<Tuple2<Source<T>, CheckpointMarkT>>> compute(Time validTime) {
  RDD<Tuple2<Source<T>, CheckpointMarkT>> rdd =
      new SourceRDD.Unbounded<>(
          ssc().sparkContext(), options, createMicrobatchSource(), numPartitions);
  return scala.Option.apply(rdd);
}
 
Example #21
Source File: FilterConverter.java    From spork with Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors,
        POFilter physicalOperator) {
    SparkUtil.assertPredecessorSize(predecessors, physicalOperator, 1);
    RDD<Tuple> rdd = predecessors.get(0);
    FilterFunction filterFunction = new FilterFunction(physicalOperator);
    return rdd.filter(filterFunction);
}
 
Example #22
Source File: ForEachConverter.java    From spork with Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors,
        POForEach physicalOperator) {
    SparkUtil.assertPredecessorSize(predecessors, physicalOperator, 1);
    RDD<Tuple> rdd = predecessors.get(0);
    ForEachFunction forEachFunction = new ForEachFunction(physicalOperator, this.confBytes);
    return rdd.toJavaRDD().mapPartitions(forEachFunction, true).rdd();
}
 
Example #23
Source File: CounterConverter.java    From spork with Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors,
        POCounter poCounter) throws IOException {
    SparkUtil.assertPredecessorSize(predecessors, poCounter, 1);
    RDD<Tuple> rdd = predecessors.get(0);
    CounterConverterFunction f = new CounterConverterFunction(poCounter);
    JavaRDD<Tuple> jRdd = rdd.toJavaRDD().mapPartitionsWithIndex(f, true);
    // jRdd = jRdd.cache();
    return jRdd.rdd();
}
 
Example #24
Source File: MaprStreamsOffsetManagerImpl.java    From datacollector with Apache License 2.0
@Override
@SuppressWarnings("unchecked")
public void saveOffsets(RDD<?> rdd) {
  Map<Integer, Long> offset = getOffsetToSave(((HasOffsetRanges) rdd).offsetRanges());
  if (!offset.isEmpty()) {
    SparkStreamingBinding.offsetHelper.saveOffsets(offset);
  } else {
    LOG.trace("Offset is empty");
  }
}
 
Example #25
Source File: SparkDDF.java    From DDF with Apache License 2.0
public <T> SparkDDF(DDFManager manager, RDD<?> rdd, Class<T> unitType, String name, Schema schema)
    throws DDFException {

  super(manager);
  if (rdd == null) throw new DDFException("Non-null RDD is required to instantiate a new SparkDDF");
  this.initialize(manager, rdd, new Class<?>[] { RDD.class, unitType }, name, schema);
}
 
Example #26
Source File: SqlHandler.java    From DDF with Apache License 2.0
@Override
public DDF sql2ddf(String command, Schema schema, DataSourceDescriptor dataSource, DataFormat dataFormat) throws DDFException {
  //    TableRDD tableRdd = null;
  //    RDD<Row> rddRow = null;

  DataFrame rdd = this.getHiveContext().sql(command);
  if (schema == null) schema = SchemaHandler.getSchemaFromDataFrame(rdd);
  DDF ddf = this.getManager().newDDF(this.getManager(), rdd, new Class<?>[]
                  {DataFrame.class}, null, schema);
  ddf.getRepresentationHandler().cache(false);
  ddf.getRepresentationHandler().get(new Class<?>[]{RDD.class, Row.class});
  return ddf;
}
 
Example #27
Source File: DeepSparkContextTest.java    From deep-spark with Apache License 2.0
@Test
public void createS3RDDTest() throws Exception {

    deepSparkContext = createDeepSparkContext();
    Configuration hadoopConf = mock(Configuration.class);
    when(sparkContext.hadoopConfiguration()).thenReturn(hadoopConf);
    DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
    SQLContext sqlContext = mock(SQLContext.class);
    Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
    Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);

    RDD<String> rdd = mock(RDD.class);
    JavaRDD<String> javaRDD = mock(JavaRDD.class);

    when(deepSparkContextSpy.sc().textFile(anyString(), anyInt())).thenReturn(rdd);
    doReturn(javaRDD).when(deepSparkContextSpy).textFile(anyString());
    when(rdd.toJavaRDD()).thenReturn(javaRDD);
    when(rdd.toJavaRDD().map(any(Function.class))).thenReturn(singleRdd);

    ExtractorConfig<Cells> config = createS3DeepJobConfig();

    deepSparkContextSpy.createS3RDD(config);

    verify(hadoopConf, times(1)).set("fs.s3n.awsAccessKeyId", config.getString(ExtractorConstants.S3_ACCESS_KEY_ID));
    verify(hadoopConf, times(1)).set("fs.s3n.awsSecretAccessKey", config.getString(ExtractorConstants.S3_SECRET_ACCESS_KEY));
    verify(deepSparkContextSpy.sc(), times(1)).textFile(anyString(), anyInt());
    verify(javaRDD, times(1)).map(any(Function.class));
}
 
Example #28
Source File: SparkStreamingSqlAnalyse.java    From sylph with Apache License 2.0
/**
 * Pre-compile the SQL so that errors surface now instead of at runtime.
 */
private static void checkDStream(
        SparkSession spark,
        String sourceTableName,
        StructType sourceSchema,
        List<Consumer<SparkSession>> handlers
)
{
    RDD<Row> rdd = spark.sparkContext().<Row>emptyRDD(ClassTag$.MODULE$.<Row>apply(Row.class));
    Dataset<Row> df = spark.createDataFrame(rdd, sourceSchema);
    df.createOrReplaceTempView(sourceTableName);
    handlers.forEach(x -> x.accept(spark));
    spark.sql("drop view " + sourceTableName);
}
 
Example #29
Source File: SparkSession.java    From nemo with Apache License 2.0
@Override
public Dataset<Row> createDataFrame(final RDD<?> rdd, final Class<?> beanClass) {
  final boolean userTriggered = initializeFunction(rdd, beanClass);
  final Dataset<Row> result = Dataset.from(super.createDataFrame(rdd, beanClass));
  this.setIsUserTriggered(userTriggered);
  return result;
}