Java Code Examples for org.apache.spark.rdd.RDD

The following examples show how to use org.apache.spark.rdd.RDD. The examples are extracted from open source projects; the source project, file, and license are noted above each example.
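Most of the examples below revolve around one recurring pattern: bridging between the Scala-facing org.apache.spark.rdd.RDD and the Java-friendly JavaRDD. As a minimal, self-contained sketch of that round trip (the class name and local master are illustrative and not taken from any of the projects below):

import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.rdd.RDD;

public class RddBridgeSketch {
    public static void main(String[] args) {
        JavaSparkContext jsc = new JavaSparkContext("local[*]", "rdd-bridge-sketch");
        try {
            JavaRDD<String> javaRdd = jsc.parallelize(Arrays.asList("a", "b", "c"));
            // Scala-facing APIs generally expect org.apache.spark.rdd.RDD ...
            RDD<String> scalaRdd = JavaRDD.toRDD(javaRdd);
            // ... and the Java-friendly view can be recovered with toJavaRDD().
            long count = scalaRdd.toJavaRDD().count();
            System.out.println("count = " + count);
        } finally {
            jsc.stop();
        }
    }
}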
Example 1
Source Project: deep-spark   Source File: DeepSparkContext.java    License: Apache License 2.0
/**
 * Returns a Cells RDD from HDFS.
 * @param config HDFS ExtractorConfig.
 * @return Cells RDD.
 */
public RDD<Cells> createHDFSRDD(ExtractorConfig<Cells> config) {

    Serializable host = config.getValues().get(ExtractorConstants.HOST);
    Serializable port = config.getValues().get(ExtractorConstants.PORT);
    Serializable path = config.getValues().get(ExtractorConstants.FS_FILE_PATH);

    final TextFileDataTable textFileDataTable = UtilFS.createTextFileMetaDataFromConfig(config, this);

    String filePath = path.toString();
    if (config.getExtractorImplClassName().equals(ExtractorConstants.HDFS)) {
        filePath = ExtractorConstants.HDFS_PREFIX + host.toString() + ":" + port + path.toString();
    }

    return createRDDFromFilePath(filePath, textFileDataTable);
}
 
Example 2
Source Project: oryx   Source File: Evaluation.java    License: Apache License 2.0
/**
 * Computes root mean squared error of {@link Rating#rating()} versus predicted value.
 */
static double rmse(MatrixFactorizationModel mfModel, JavaRDD<Rating> testData) {
  JavaPairRDD<Tuple2<Integer,Integer>,Double> testUserProductValues =
      testData.mapToPair(rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating()));
  @SuppressWarnings("unchecked")
  RDD<Tuple2<Object,Object>> testUserProducts =
      (RDD<Tuple2<Object,Object>>) (RDD<?>) testUserProductValues.keys().rdd();
  JavaRDD<Rating> predictions = testData.wrapRDD(mfModel.predict(testUserProducts));
  double mse = predictions.mapToPair(
      rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating())
  ).join(testUserProductValues).values().mapToDouble(valuePrediction -> {
    double diff = valuePrediction._1() - valuePrediction._2();
    return diff * diff;
  }).mean();
  return Math.sqrt(mse);
}
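The unchecked double cast above, (RDD<Tuple2<Object,Object>>) (RDD<?>), is the usual way to hand a JavaPairRDD's keys to an MLlib method whose Scala signature expects RDD<Tuple2<Object, Object>>. Isolated as a small helper, the pattern looks roughly like this (a sketch with illustrative names, not part of the oryx code):

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.rdd.RDD;

import scala.Tuple2;

public final class UserProductCastSketch {

    private UserProductCastSketch() {
    }

    // Erases the concrete key/value types so the pair RDD can be passed to
    // MatrixFactorizationModel.predict(RDD<Tuple2<Object, Object>>).
    @SuppressWarnings("unchecked")
    public static RDD<Tuple2<Object, Object>> toObjectPairs(JavaPairRDD<Integer, Integer> userProducts) {
        return (RDD<Tuple2<Object, Object>>) (RDD<?>) userProducts.rdd();
    }
}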
 
Example 3
Source Project: spork   Source File: CollectedGroupConverter.java    License: Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors,
    POCollectedGroup physicalOperator) throws IOException {
  SparkUtil.assertPredecessorSize(predecessors, physicalOperator, 1);
  RDD<Tuple> rdd = predecessors.get(0);
  // return predecessors.get(0);
  RDD<Tuple> rdd2 = rdd.coalesce(1, false, null);
  long count = 0;
  try {
    count = rdd2.count();
  } catch (Exception e) {
    // ignored: fall back to count = 0 if counting the coalesced RDD fails
  }
  CollectedGroupFunction collectedGroupFunction
      = new CollectedGroupFunction(physicalOperator, count);
  return rdd.toJavaRDD().mapPartitions(collectedGroupFunction, true).rdd();
}
 
Example 4
Source Project: systemds   Source File: MLContextConversionUtil.java    License: Apache License 2.0
/**
 * Convert a {@code MatrixObject} to a {@code RDD<String>} in IJV format.
 *
 * @param matrixObject
 *            the {@code MatrixObject}
 * @return the {@code MatrixObject} converted to a {@code RDD<String>}
 */
public static RDD<String> matrixObjectToRDDStringIJV(MatrixObject matrixObject) {

	// NOTE: The following works when called from Java but does not
	// currently work when called from Spark Shell (when you call
	// collect() on the RDD<String>).
	//
	// JavaRDD<String> javaRDD = jsc.parallelize(list);
	// RDD<String> rdd = JavaRDD.toRDD(javaRDD);
	//
	// Therefore, we call parallelize() on the SparkContext rather than
	// the JavaSparkContext to produce the RDD<String> for Scala.

	List<String> list = matrixObjectToListStringIJV(matrixObject);

	ClassTag<String> tag = scala.reflect.ClassTag$.MODULE$.apply(String.class);
	return sc().parallelize(JavaConversions.asScalaBuffer(list), sc().defaultParallelism(), tag);
}
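The NOTE above boils down to one point: build the Scala RDD by calling parallelize() on the SparkContext, with an explicit ClassTag, rather than on the JavaSparkContext. Stripped of the SystemDS wrappers, a minimal self-contained sketch of that pattern might look as follows (class name, data, and local master are illustrative):

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkContext;
import org.apache.spark.rdd.RDD;

import scala.collection.JavaConversions;
import scala.reflect.ClassTag;

public class ScalaParallelizeSketch {
    public static void main(String[] args) {
        SparkContext sc = new SparkContext("local[*]", "scala-parallelize-sketch");
        try {
            List<String> list = Arrays.asList("1 1 1.0", "1 2 2.0", "2 2 3.0");
            ClassTag<String> tag = scala.reflect.ClassTag$.MODULE$.apply(String.class);
            // Parallelize through the Scala SparkContext, as matrixObjectToRDDStringIJV does above.
            RDD<String> rdd = sc.parallelize(
                    JavaConversions.asScalaBuffer(list), sc.defaultParallelism(), tag);
            System.out.println("count = " + rdd.count());
        } finally {
            sc.stop();
        }
    }
}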
 
Example 5
Source Project: systemds   Source File: MLContextConversionUtil.java    License: Apache License 2.0
/**
 * Convert a {@code FrameObject} to a {@code RDD<String>} in IJV format.
 *
 * @param frameObject
 *            the {@code FrameObject}
 * @return the {@code FrameObject} converted to a {@code RDD<String>}
 */
public static RDD<String> frameObjectToRDDStringIJV(FrameObject frameObject) {

	// NOTE: The following works when called from Java but does not
	// currently work when called from Spark Shell (when you call
	// collect() on the RDD<String>).
	//
	// JavaRDD<String> javaRDD = jsc.parallelize(list);
	// RDD<String> rdd = JavaRDD.toRDD(javaRDD);
	//
	// Therefore, we call parallelize() on the SparkContext rather than
	// the JavaSparkContext to produce the RDD<String> for Scala.

	List<String> list = frameObjectToListStringIJV(frameObject);

	ClassTag<String> tag = scala.reflect.ClassTag$.MODULE$.apply(String.class);
	return sc().parallelize(JavaConversions.asScalaBuffer(list), sc().defaultParallelism(), tag);
}
 
Example 6
Source Project: spork   Source File: LoadConverter.java    License: Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessorRdds, POLoad poLoad)
        throws IOException {
    // if (predecessors.size()!=0) {
    // throw new
    // RuntimeException("Should not have predecessors for Load. Got : "+predecessors);
    // }

    JobConf loadJobConf = SparkUtil.newJobConf(pigContext);
    configureLoader(physicalPlan, poLoad, loadJobConf);

    // don't know why but just doing this cast for now
    RDD<Tuple2<Text, Tuple>> hadoopRDD = sparkContext.newAPIHadoopFile(
            poLoad.getLFile().getFileName(), PigInputFormatSpark.class,
            Text.class, Tuple.class, loadJobConf);

    registerUdfFiles();
    // map to get just RDD<Tuple>
    return hadoopRDD.map(TO_TUPLE_FUNCTION,
            SparkUtil.getManifest(Tuple.class));
}
 
Example 7
Source Project: deep-spark   Source File: ExtractorTest.java    License: Apache License 2.0
/**
 * Test filter EQ.
 *
 * @param <W> the type parameter
 */
@Test(alwaysRun = true, dependsOnGroups = { "FunctionalTests" })
protected <W> void testFilterEQ() {
    DeepSparkContext context = getDeepSparkContext();
    try {

        Filter[] filters = null;

        Filter filter = new Filter("id", FilterType.EQ, "TestDataSet");
        filters = new Filter[] { filter };
        ExtractorConfig<W> inputConfigEntity2 = getFilterConfig(filters);

        RDD<W> inputRDDEntity2 = context.createRDD(inputConfigEntity2);
        assertEquals(inputRDDEntity2.count(), 1);
    } finally {
        context.stop();
    }

}
 
Example 8
Source Project: systemds   Source File: MLContextTest.java    License: Apache License 2.0
@Test
public void testRDDGoodMetadataDML() {
	System.out.println("MLContextTest - RDD<String> good metadata DML");

	List<String> list = new ArrayList<>();
	list.add("1,1,1");
	list.add("2,2,2");
	list.add("3,3,3");
	JavaRDD<String> javaRDD = sc.parallelize(list);
	RDD<String> rdd = JavaRDD.toRDD(javaRDD);

	MatrixMetadata mm = new MatrixMetadata(3, 3, 9);

	Script script = dml("print('sum: ' + sum(M));").in("M", rdd, mm);
	setExpectedStdOut("sum: 18.0");
	ml.execute(script);
}
 
Example 9
Source Project: spork   Source File: RankConverter.java    License: Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, PORank poRank)
		throws IOException {
	SparkUtil.assertPredecessorSize(predecessors, poRank, 1);
	RDD<Tuple> rdd = predecessors.get(0);
	JavaPairRDD<Integer, Long> javaPairRdd = rdd.toJavaRDD()
			.mapToPair(new ToPairRdd());
	JavaPairRDD<Integer, Iterable<Long>> groupedByIndex = javaPairRdd
			.groupByKey();
	JavaPairRDD<Integer, Long> countsByIndex = groupedByIndex
			.mapToPair(new IndexCounters());
	JavaPairRDD<Integer, Long> sortedCountsByIndex = countsByIndex
			.sortByKey(true);
	Map<Integer, Long> counts = sortedCountsByIndex.collectAsMap();
	JavaRDD<Tuple> finalRdd = rdd.toJavaRDD()
			.map(new RankFunction(new HashMap<Integer, Long>(counts)));
	return finalRdd.rdd();
}
 
Example 10
/**
 * Constructor.
 *
 * @param sparkSession sparkSession to recreate on each executor.
 * @param dataset      Dataset to read data from.
 */
public SparkDatasetBoundedSourceVertex(final SparkSession sparkSession, final Dataset<T> dataset) {
  this.readables = new ArrayList<>();
  final RDD rdd = dataset.sparkRDD();
  final Partition[] partitions = rdd.getPartitions();
  for (int i = 0; i < partitions.length; i++) {
    readables.add(new SparkDatasetBoundedSourceReadable(
      partitions[i],
      sparkSession.getDatasetCommandsList(),
      sparkSession.getInitialConf(),
      i));
  }
  this.estimatedByteSize = dataset.javaRDD()
    .map(o -> (long) o.toString().getBytes("UTF-8").length)
    .reduce((a, b) -> a + b);
}
 
Example 11
Source Project: spork   Source File: DistinctConverter.java    License: Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors,
        PODistinct poDistinct) throws IOException {
    SparkUtil.assertPredecessorSize(predecessors, poDistinct, 1);
    RDD<Tuple> rdd = predecessors.get(0);

    ClassTag<Tuple2<Tuple, Object>> tuple2ClassManifest = SparkUtil
            .<Tuple, Object> getTuple2Manifest();

    RDD<Tuple2<Tuple, Object>> rddPairs = rdd.map(TO_KEY_VALUE_FUNCTION,
            tuple2ClassManifest);
    PairRDDFunctions<Tuple, Object> pairRDDFunctions
      = new PairRDDFunctions<Tuple, Object>(
            rddPairs, SparkUtil.getManifest(Tuple.class),
            SparkUtil.getManifest(Object.class), null);
    int parallelism = SparkUtil.getParallelism(predecessors, poDistinct);
    return pairRDDFunctions.reduceByKey(MERGE_VALUES_FUNCTION, parallelism)
            .map(TO_VALUE_FUNCTION, SparkUtil.getManifest(Tuple.class));
}
 
Example 12
Source Project: deep-spark   Source File: JdbcEntityExtractorFT.java    License: Apache License 2.0
@Test
@Override
public void testDataSet() {
    DeepSparkContext context = new DeepSparkContext("local", "deepSparkContextTest");

    try {

        ExtractorConfig<MessageTestEntity> inputConfigEntity = getReadExtractorConfig();
        RDD<MessageTestEntity> inputRDDEntity = context.createRDD(inputConfigEntity);

        //Import dataSet was OK and we could read it
        assertEquals(inputRDDEntity.count(), 1, "Expected read entity count is 1");

    } finally {
        context.stop();
    }

}
 
Example 13
Source Project: DDF   Source File: MLMetricsSupporter.java    License: Apache License 2.0
@Override
public DDF residuals() throws DDFException {
  SparkDDF predictionDDF = (SparkDDF) this.getDDF();
  JavaRDD<double[]> predictionRDD = predictionDDF.getJavaRDD(double[].class);

  JavaRDD<double[]> result = predictionRDD.map(new MetricsMapperResiduals());

  if (result == null) mLog.error(">> javaRDD result of MetricMapper residuals is null");
  if (predictionDDF.getManager() == null) mLog.error(">> predictionDDF.getManager() is null");
  if (result.rdd() == null) mLog.error(">> result.rdd() is null");
  if (predictionDDF.getSchema() == null) mLog.error(">> predictionDDF.getSchema() is null");
  if (predictionDDF.getName() == null) mLog.error(">> predictionDDF.getName() is null");

  Schema schema = new Schema("residuals double");
  DDFManager manager = this.getDDF().getManager();
  DDF residualDDF = manager
      .newDDF(manager, result.rdd(), new Class<?>[] { RDD.class, double[].class }, null,
          schema);

  if (residualDDF == null) mLog.error(">>>>>>>>>>>.residualDDF is null");

  return residualDDF;
}
 
Example 14
Source Project: systemds   Source File: MLContextTest.java    License: Apache License 2.0
@Test
public void testRDDSumIJVDML() {
	System.out.println("MLContextTest - RDD<String> IJV sum DML");

	List<String> list = new ArrayList<>();
	list.add("1 1 1");
	list.add("2 1 2");
	list.add("1 2 3");
	list.add("3 3 4");
	JavaRDD<String> javaRDD = sc.parallelize(list);
	RDD<String> rdd = JavaRDD.toRDD(javaRDD);

	MatrixMetadata mm = new MatrixMetadata(MatrixFormat.IJV, 3, 3);

	Script script = dml("print('sum: ' + sum(M));").in("M", rdd, mm);
	setExpectedStdOut("sum: 10.0");
	ml.execute(script);
}
 
Example 15
@SuppressWarnings("rawtypes")
// TODO (wjk): remove this when we have a better way to change name of RDDs implicitly created within spark
private static void setAncestorRDDNames(org.apache.spark.rdd.RDD rdd, int levels, String[] newNames, String[] checkNames) {
    assert levels > 0;
    org.apache.spark.rdd.RDD currentRDD = rdd;
    for (int i = 0; i < levels && currentRDD != null; i++) {
        org.apache.spark.rdd.RDD rddAnc =
                ((org.apache.spark.Dependency) currentRDD.dependencies().head()).rdd();
        if (rddAnc != null) {
            if (checkNames == null || checkNames[i] == null)
                rddAnc.setName(newNames[i]);
            else if (rddAnc.name().equals(checkNames[i]))
                rddAnc.setName(newNames[i]);
        }
        currentRDD = rddAnc;
    }
}
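As a hedged, self-contained illustration of what this helper manipulates, the sketch below builds a tiny word-count pipeline, names the resulting RDD, and then renames the parent RDD one dependency level up (the map-side RDD feeding the shuffle), which is what setAncestorRDDNames does per level. The pipeline, names, and local master are made up for the example:

import java.util.Arrays;

import org.apache.spark.Dependency;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class AncestorRddNameSketch {
    public static void main(String[] args) {
        JavaSparkContext jsc = new JavaSparkContext("local[*]", "ancestor-rdd-name-sketch");
        try {
            JavaPairRDD<String, Integer> counts = jsc
                    .parallelize(Arrays.asList("a", "b", "a"))
                    .mapToPair(word -> new Tuple2<>(word, 1))
                    .reduceByKey(Integer::sum);
            counts.rdd().setName("word-counts");
            // Walk one dependency level up and rename the implicitly created parent RDD.
            Dependency<?> firstDependency = (Dependency<?>) counts.rdd().dependencies().head();
            firstDependency.rdd().setName("word-counts-parent");
            System.out.println(counts.rdd().toDebugString());
        } finally {
            jsc.stop();
        }
    }
}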
 
Example 16
Source Project: deep-spark   Source File: DeepSparkContextTest.java    License: Apache License 2.0
@Test
public void createHDFSRDDTest() throws Exception {

    deepSparkContext = createDeepSparkContext();
    DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
    SQLContext sqlContext = mock(SQLContext.class);
    Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
    Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);

    RDD<String> rdd = mock(RDD.class);
    JavaRDD<String> javaRdd = mock(JavaRDD.class);
    when(deepSparkContextSpy.sc().textFile(anyString(), anyInt())).thenReturn(rdd);
    doReturn(javaRdd).when(deepSparkContextSpy).textFile(anyString());
    when(rdd.toJavaRDD()).thenReturn(javaRdd);
    when(rdd.toJavaRDD().map(any(Function.class))).thenReturn(singleRdd);

    ExtractorConfig<Cells> config = createHDFSDeepJobConfig();

    RDD rddReturn = deepSparkContextSpy.createHDFSRDD(config);

    verify(deepSparkContextSpy.sc(), times(1)).textFile(anyString(), anyInt());

    verify(javaRdd, times(1)).map(any(Function.class));

}
 
Example 17
Source Project: deep-spark   Source File: AerospikeCellExtractorFT.java    License: Apache License 2.0
@Test
@Override
public void testDataSet() {
    DeepSparkContext context = new DeepSparkContext("local", "deepSparkContextTest");

    try {

        ExtractorConfig<Cells> inputConfigEntity = new ExtractorConfig(Cells.class);
        inputConfigEntity.putValue(ExtractorConstants.HOST, AerospikeJavaRDDFT.HOST)
                .putValue(ExtractorConstants.PORT, AerospikeJavaRDDFT.PORT)
                .putValue(ExtractorConstants.NAMESPACE, AerospikeJavaRDDFT.NAMESPACE_CELL)
                .putValue(ExtractorConstants.SET, ExtractorTest.BOOK_INPUT);
        inputConfigEntity.setExtractorImplClass(AerospikeCellExtractor.class);

        RDD<Cells> inputRDDEntity = context.createRDD(inputConfigEntity);

        //Import dataSet was OK and we could read it
        assertEquals(inputRDDEntity.count(), 1, "Expected read entity count is 1");

    } finally {
        context.stop();
    }

}
 
Example 18
Source Project: oryx   Source File: MLUpdate.java    License: Apache License 2.0
/**
 * Default implementation which randomly splits new data into train/test sets.
 * This handles the case where {@link #getTestFraction()} is not 0 or 1.
 *
 * @param newData data that has arrived in the current input batch
 * @return a {@link Pair} of train, test {@link RDD}s.
 */
protected Pair<JavaRDD<M>,JavaRDD<M>> splitNewDataToTrainTest(JavaRDD<M> newData) {
  RDD<M>[] testTrainRDDs = newData.rdd().randomSplit(
      new double[]{1.0 - testFraction, testFraction},
      RandomManager.getRandom().nextLong());
  return new Pair<>(newData.wrapRDD(testTrainRDDs[0]),
                    newData.wrapRDD(testTrainRDDs[1]));
}
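When everything stays at the JavaRDD level, the same split can be written without dropping to the Scala RDD at all, since JavaRDD exposes randomSplit directly. A minimal sketch (data, fractions, and seed are illustrative):

import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class RandomSplitSketch {
    public static void main(String[] args) {
        JavaSparkContext jsc = new JavaSparkContext("local[*]", "random-split-sketch");
        try {
            JavaRDD<Integer> data = jsc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10));
            double testFraction = 0.2;
            // The weights do not need to sum to 1; Spark normalizes them.
            JavaRDD<Integer>[] splits = data.randomSplit(
                    new double[] { 1.0 - testFraction, testFraction }, 42L);
            System.out.println("train = " + splits[0].count() + ", test = " + splits[1].count());
        } finally {
            jsc.stop();
        }
    }
}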
 
Example 19
Source Project: DDF   Source File: SqlHandler.java    License: Apache License 2.0
@Override
public DDF sql2ddf(String command, Schema schema, DataSourceDescriptor dataSource, DataFormat dataFormat) throws DDFException {
  //    TableRDD tableRdd = null;
  //    RDD<Row> rddRow = null;

  DataFrame rdd = this.getHiveContext().sql(command);
  if (schema == null) schema = SchemaHandler.getSchemaFromDataFrame(rdd);
  DDF ddf = this.getManager().newDDF(this.getManager(), rdd, new Class<?>[]
                  {DataFrame.class}, null, schema);
  ddf.getRepresentationHandler().cache(false);
  ddf.getRepresentationHandler().get(new Class<?>[]{RDD.class, Row.class});
  return ddf;
}
 
Example 20
Source Project: DDF   Source File: BasicStatisticsComputer.java    License: Apache License 2.0
@SuppressWarnings("unchecked")
@Override
public Summary[] getSummaryImpl() throws DDFException {
  RDD<Object[]> rdd = (RDD<Object[]>) this.getDDF().getRepresentationHandler().get(RDD.class, Object[].class);

  JavaRDD<Object[]> data = rdd.toJavaRDD();
  Summary[] stats = data.map(new GetSummaryMapper()).reduce(new GetSummaryReducer());
  return stats;
}
 
Example 21
Source Project: oryx   Source File: Evaluation.java    License: Apache License 2.0
private static JavaPairRDD<Integer,Iterable<Rating>> predictAll(
    MatrixFactorizationModel mfModel,
    JavaRDD<Rating> data,
    JavaPairRDD<Integer,Integer> userProducts) {
  @SuppressWarnings("unchecked")
  RDD<Tuple2<Object,Object>> userProductsRDD =
      (RDD<Tuple2<Object,Object>>) (RDD<?>) userProducts.rdd();
  return data.wrapRDD(mfModel.predict(userProductsRDD)).groupBy(Rating::user);
}
 
Example 22
Source Project: systemds   Source File: MLContextTest.java    License: Apache License 2.0
@Test
public void testOutputRDDStringIJVDML() {
	System.out.println("MLContextTest - output RDD String IJV DML");

	String s = "M = matrix('1 2 3 4', rows=2, cols=2);";
	Script script = dml(s).out("M");
	MLResults results = ml.execute(script);
	RDD<String> rddStringIJV = results.getRDDStringIJV("M");
	Iterator<String> iterator = rddStringIJV.toLocalIterator();
	Assert.assertEquals("1 1 1.0", iterator.next());
	Assert.assertEquals("1 2 2.0", iterator.next());
	Assert.assertEquals("2 1 3.0", iterator.next());
	Assert.assertEquals("2 2 4.0", iterator.next());
}
 
Example 23
Source Project: systemds   Source File: MLContextTest.java    License: Apache License 2.0
@Test
public void testOutputRDDStringCSVFromMatrixDML() {
	System.out.println("MLContextTest - output RDD String CSV from matrix DML");

	String s = "M = matrix('1 2 3 4', rows=1, cols=4);";
	Script script = dml(s).out("M");
	RDD<String> rddStringCSV = ml.execute(script).getMatrix("M").toRDDStringCSV();
	Iterator<String> iterator = rddStringCSV.toLocalIterator();
	Assert.assertEquals("1.0,2.0,3.0,4.0", iterator.next());
}
 
Example 24
Source Project: spork   Source File: PackageConverter.java    License: Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors,
        POPackage physicalOperator) throws IOException {
    SparkUtil.assertPredecessorSize(predecessors, physicalOperator, 1);
    RDD<Tuple> rdd = predecessors.get(0);
    // package will generate the group from the result of the local
    // rearrange
    return rdd.map(new PackageFunction(physicalOperator, this.confBytes),
            SparkUtil.getManifest(Tuple.class));
}
 
Example 25
Source Project: deep-spark   Source File: JdbcCellExtractorFT.java    License: Apache License 2.0
@Test
@Override
public void testInputColumns() {
    DeepSparkContext context = new DeepSparkContext("local", "deepSparkContextTest");

    try {

        String [] inputColumns = new String [1];
        inputColumns[0] = "id";
        ExtractorConfig<Cells> inputConfigEntity = getReadExtractorConfig();
        inputConfigEntity.putValue(ExtractorConstants.INPUT_COLUMNS, inputColumns);
        RDD<Cells> inputRDDEntity = context.createRDD(inputConfigEntity);

        assertEquals(inputRDDEntity.count(), 1, "Expected read entity count is 1");
        Cells cells = inputRDDEntity.first();
        assertNotNull(cells.getCellByName("id"));
        assertNull(cells.getCellByName("message"));
        assertNull(cells.getCellByName("number"));

        String [] inputColumns2 = new String [2];
        inputColumns2[0] = "message";
        inputColumns2[1] = "number";
        ExtractorConfig<Cells> inputConfigEntity2 = getReadExtractorConfig();
        inputConfigEntity2.putValue(ExtractorConstants.INPUT_COLUMNS, inputColumns2);
        RDD<Cells> inputRDDEntity2 = context.createRDD(inputConfigEntity2);

        assertEquals(inputRDDEntity2.count(), 1, "Expected read entity count is 1");
        Cells cells2 = inputRDDEntity2.first();
        assertNull(cells2.getCellByName("id"));
        assertNotNull(cells2.getCellByName("message"));
        assertNotNull(cells2.getCellByName("number"));

    } finally {
        context.stop();
    }
}
 
Example 26
Source Project: spork   Source File: LimitConverter.java    License: Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, POLimit poLimit)
        throws IOException {
    SparkUtil.assertPredecessorSize(predecessors, poLimit, 1);
    RDD<Tuple> rdd = predecessors.get(0);
    LimitFunction limitFunction = new LimitFunction(poLimit);
    RDD<Tuple> rdd2 = rdd.coalesce(1, false, null);
    return rdd2.toJavaRDD().mapPartitions(limitFunction, false).rdd();
}
 
Example 27
Source Project: deep-spark   Source File: JdbcEntityExtractorFT.java    License: Apache License 2.0
@Test
@Override
public void testFilterNEQ() {
    DeepSparkContext context = new DeepSparkContext("local", "deepSparkContextTest");

    try {

        ExtractorConfig<MessageTestEntity> inputConfigEntity = getReadExtractorConfig();
        Filter[] filters = new Filter[1];
        Filter filter = new Filter("number", FilterType.NEQ, 1);
        filters[0] = filter;
        inputConfigEntity.putValue(ExtractorConstants.FILTER_QUERY, filters);
        RDD<MessageTestEntity> inputRDDEntity = context.createRDD(inputConfigEntity);

        //Import dataSet was OK and we could read it
        assertEquals(inputRDDEntity.count(), 1, "Expected read entity count is 1");

        ExtractorConfig<MessageTestEntity> inputConfigEntity2 = getReadExtractorConfig();
        Filter[] filters2 = new Filter[1];
        Filter filter2 = new Filter("number", FilterType.NEQ, 3);
        filters2[0] = filter2;
        inputConfigEntity2.putValue(ExtractorConstants.FILTER_QUERY, filters2);
        RDD<MessageTestEntity> inputRDDEntity2 = context.createRDD(inputConfigEntity2);

        //Import dataSet was OK and we could read it
        assertEquals(inputRDDEntity2.count(), 0, "Expected read entity count is 0");

    } finally {
        context.stop();
    }
}