Java Code Examples for org.apache.spark.api.java.JavaRDD#rdd()

The following examples show how to use org.apache.spark.api.java.JavaRDD#rdd(), which returns the underlying Scala RDD that backs a JavaRDD. Each example is drawn from an open-source project; the source file, project, and license are noted above each snippet.
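
Before the project examples, here is a minimal, self-contained sketch of the pattern they all share (the class name and local variables are illustrative and not taken from any of the projects below): rdd() unwraps the Scala RDD so it can be passed to APIs that expect org.apache.spark.rdd.RDD, and toJavaRDD() wraps a Scala RDD back into the Java-friendly wrapper.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.rdd.RDD;

public class JavaRddToRddSketch {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaRDD#rdd() sketch").setMaster("local[*]");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // Java-friendly wrapper around a Scala RDD.
    JavaRDD<String> javaRdd = jsc.parallelize(Arrays.asList("a", "b", "c"));

    // rdd() exposes the underlying Scala RDD for APIs that require org.apache.spark.rdd.RDD.
    RDD<String> scalaRdd = javaRdd.rdd();

    // toJavaRDD() converts back to the Java-friendly wrapper.
    JavaRDD<String> roundTripped = scalaRdd.toJavaRDD();

    System.out.println(roundTripped.count());  // prints 3

    jsc.stop();
  }
}
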
Example 1
Source File: JavaPCAExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("PCA Example");
  SparkContext sc = new SparkContext(conf);

  // $example on$
  double[][] array = {{1.12, 2.05, 3.12}, {5.56, 6.28, 8.94}, {10.2, 8.0, 20.5}};
  LinkedList<Vector> rowsList = new LinkedList<>();
  for (int i = 0; i < array.length; i++) {
    Vector currentRow = Vectors.dense(array[i]);
    rowsList.add(currentRow);
  }
  JavaRDD<Vector> rows = JavaSparkContext.fromSparkContext(sc).parallelize(rowsList);

  // Create a RowMatrix from JavaRDD<Vector>.
  RowMatrix mat = new RowMatrix(rows.rdd());

  // Compute the top 3 principal components.
  Matrix pc = mat.computePrincipalComponents(3);
  RowMatrix projected = mat.multiply(pc);
  // $example off$
  Vector[] collectPartitions = (Vector[])projected.rows().collect();
  System.out.println("Projected vector of principal component:");
  for (Vector vector : collectPartitions) {
    System.out.println("\t" + vector);
  }
}
 
Example 2
Source File: SubStringCounterRelation.java    From net.jgp.labs.spark with Apache License 2.0
@Override
public RDD<Row> buildScan() {
  log.debug("-> buildScan()");

  // I have isolated the work to a method to keep the plumbing code as simple as possible.
  List<List<Integer>> table = collectData();

  @SuppressWarnings("resource") // cannot be closed here, done elsewhere
  JavaSparkContext sparkContext = new JavaSparkContext(sqlContext
      .sparkContext());
  JavaRDD<Row> rowRDD = sparkContext.parallelize(table)
      .map(row -> RowFactory.create(row.toArray()));

  return rowRDD.rdd();
}
 
Example 3
Source File: SparkConverter.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 * Create a distributed matrix given an Apache Commons RealMatrix.
 *
 * @param sc JavaSparkContext used to parallelize the rows.  Never {@code null}
 * @param realMat Apache Commons RealMatrix.  Never {@code null}
 * @param numSlices Number of Spark partitions for the resulting row RDD
 * @return A distributed Spark matrix
 */
public static RowMatrix convertRealMatrixToSparkRowMatrix(JavaSparkContext sc, RealMatrix realMat, int numSlices) {
    logger.info("Converting matrix to distributed Spark matrix...");
    final double [][] dataArray = realMat.getData();
    final LinkedList<Vector> rowsList = new LinkedList<>();
    for (final double [] i : dataArray) {
        final Vector currentRow = Vectors.dense(i);
        rowsList.add(currentRow);
    }

    // We may want to swap out this static value for something dynamic (as shown below), but this seems to slow it down.
    // final int totalSpace = realMat.getColumnDimension() * realMat.getRowDimension() * Double.BYTES;
    // // Want the partitions to be ~100KB of space
    // final int slices = totalSpace/100000;
    final JavaRDD<Vector> rows = sc.parallelize(rowsList, numSlices);

    // Create a RowMatrix from JavaRDD<Vector>.
    final RowMatrix mat = new RowMatrix(rows.rdd());
    logger.info("Done converting matrix to distributed Spark matrix...");
    return mat;
}
 
Example 4
Source File: MLMetricsSupporter.java    From DDF with Apache License 2.0 6 votes vote down vote up
@Override
public DDF residuals() throws DDFException {
  SparkDDF predictionDDF = (SparkDDF) this.getDDF();
  JavaRDD<double[]> predictionRDD = predictionDDF.getJavaRDD(double[].class);

  JavaRDD<double[]> result = predictionRDD.map(new MetricsMapperResiduals());

  if (result == null) mLog.error(">> javaRDD result of MetricMapper residuals is null");
  if (predictionDDF.getManager() == null) mLog.error(">> predictionDDF.getManager() is null");
  if (result.rdd() == null) mLog.error(">> result.rdd() is null");
  if (predictionDDF.getSchema() == null) mLog.error(">> predictionDDF.getSchema() is null");
  if (predictionDDF.getName() == null) mLog.error(">> predictionDDF.getName() is null");

  Schema schema = new Schema("residuals double");
  DDFManager manager = this.getDDF().getManager();
  DDF residualDDF = manager
      .newDDF(manager, result.rdd(), new Class<?>[] { RDD.class, double[].class }, null,
          schema);

  if (residualDDF == null) mLog.error(">>>>>>>>>>>.residualDDF is null");

  return residualDDF;
}
 
Example 5
Source File: SparkConverter.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Create a distributed matrix given an Apache Commons RealMatrix.
 *
 * @param sc JavaSparkContext used to parallelize the rows.  Never {@code null}
 * @param realMat Apache Commons RealMatrix.  Never {@code null}
 * @param numSlices Number of Spark partitions for the resulting row RDD
 * @return A distributed Spark matrix
 */
public static RowMatrix convertRealMatrixToSparkRowMatrix(JavaSparkContext sc, RealMatrix realMat, int numSlices) {
    logger.info("Converting matrix to distributed Spark matrix...");
    final double [][] dataArray = realMat.getData();
    final LinkedList<Vector> rowsList = new LinkedList<>();
    for (final double [] i : dataArray) {
        final Vector currentRow = Vectors.dense(i);
        rowsList.add(currentRow);
    }

    // We may want to swap out this static value for something dynamic (as shown below), but this seems to slow it down.
    // final int totalSpace = realMat.getColumnDimension() * realMat.getRowDimension() * Double.BYTES;
    // // Want the partitions to be ~100KB of space
    // final int slices = totalSpace/100000;
    final JavaRDD<Vector> rows = sc.parallelize(rowsList, numSlices);

    // Create a RowMatrix from JavaRDD<Vector>.
    final RowMatrix mat = new RowMatrix(rows.rdd());
    logger.info("Done converting matrix to distributed Spark matrix...");
    return mat;
}
 
Example 6
Source File: RankConverter.java    From spork with Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, PORank poRank)
        throws IOException {
    SparkUtil.assertPredecessorSize(predecessors, poRank, 1);
    RDD<Tuple> rdd = predecessors.get(0);
    JavaPairRDD<Integer, Long> javaPairRdd = rdd.toJavaRDD()
            .mapToPair(new ToPairRdd());
    JavaPairRDD<Integer, Iterable<Long>> groupedByIndex = javaPairRdd
            .groupByKey();
    JavaPairRDD<Integer, Long> countsByIndex = groupedByIndex
            .mapToPair(new IndexCounters());
    JavaPairRDD<Integer, Long> sortedCountsByIndex = countsByIndex
            .sortByKey(true);
    Map<Integer, Long> counts = sortedCountsByIndex.collectAsMap();
    JavaRDD<Tuple> finalRdd = rdd.toJavaRDD()
            .map(new RankFunction(new HashMap<Integer, Long>(counts)));
    return finalRdd.rdd();
}
 
Example 7
Source File: StoreConverter.java    From spork with Apache License 2.0
@Override
public RDD<Tuple2<Text, Tuple>> convert(List<RDD<Tuple>> predecessors,
        POStore physicalOperator) throws IOException {
    SparkUtil.assertPredecessorSize(predecessors, physicalOperator, 1);
    RDD<Tuple> rdd = predecessors.get(0);
    // convert back to KV pairs
    JavaRDD<Tuple2<Text, Tuple>> rddPairs = rdd.toJavaRDD().map(FROM_TUPLE_FUNCTION);

    PairRDDFunctions<Text, Tuple> pairRDDFunctions = new PairRDDFunctions<Text, Tuple>(
            rddPairs.rdd(), SparkUtil.getManifest(Text.class),
            SparkUtil.getManifest(Tuple.class), null);

    JobConf storeJobConf = SparkUtil.newJobConf(pigContext);
    POStore poStore = configureStorer(storeJobConf, physicalOperator);

    pairRDDFunctions.saveAsNewAPIHadoopFile(poStore.getSFile()
            .getFileName(), Text.class, Tuple.class, PigOutputFormat.class,
            storeJobConf);

    return rddPairs.rdd();
}
 
Example 8
Source File: SortConverter.java    From spork with Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, POSort sortOperator)
        throws IOException {
    SparkUtil.assertPredecessorSize(predecessors, sortOperator, 1);
    RDD<Tuple> rdd = predecessors.get(0);
    RDD<Tuple2<Tuple, Object>> rddPair = rdd.map(new ToKeyValueFunction(),
            SparkUtil.<Tuple, Object> getTuple2Manifest());

    JavaPairRDD<Tuple, Object> r = new JavaPairRDD<Tuple, Object>(rddPair,
            SparkUtil.getManifest(Tuple.class),
            SparkUtil.getManifest(Object.class));

    JavaPairRDD<Tuple, Object> sorted = r.sortByKey(
            sortOperator.new SortComparator(), true);
    JavaRDD<Tuple> mapped = sorted.mapPartitions(TO_VALUE_FUNCTION);

    return mapped.rdd();
}
 
Example 9
Source File: JavaSVDExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("SVD Example");
  SparkContext sc = new SparkContext(conf);
  JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc);

  // $example on$
  double[][] array = {{1.12, 2.05, 3.12}, {5.56, 6.28, 8.94}, {10.2, 8.0, 20.5}};
  LinkedList<Vector> rowsList = new LinkedList<>();
  for (int i = 0; i < array.length; i++) {
    Vector currentRow = Vectors.dense(array[i]);
    rowsList.add(currentRow);
  }
  JavaRDD<Vector> rows = jsc.parallelize(rowsList);

  // Create a RowMatrix from JavaRDD<Vector>.
  RowMatrix mat = new RowMatrix(rows.rdd());

  // Compute the top 3 singular values and corresponding singular vectors.
  SingularValueDecomposition<RowMatrix, Matrix> svd = mat.computeSVD(3, true, 1.0E-9d);
  RowMatrix U = svd.U();
  Vector s = svd.s();
  Matrix V = svd.V();
  // $example off$
  Vector[] collectPartitions = (Vector[]) U.rows().collect();
  System.out.println("U factor is:");
  for (Vector vector : collectPartitions) {
    System.out.println("\t" + vector);
  }
  System.out.println("Singular values are: " + s);
  System.out.println("V factor is:\n" + V);

  jsc.stop();
}
 
Example 10
Source File: JavaLogisticRegressionWithLBFGSExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("JavaLogisticRegressionWithLBFGSExample");
  SparkContext sc = new SparkContext(conf);
  // $example on$
  String path = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();

  // Split initial RDD into two... [60% training data, 40% testing data].
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[] {0.6, 0.4}, 11L);
  JavaRDD<LabeledPoint> training = splits[0].cache();
  JavaRDD<LabeledPoint> test = splits[1];

  // Run training algorithm to build the model.
  final LogisticRegressionModel model = new LogisticRegressionWithLBFGS()
    .setNumClasses(10)
    .run(training.rdd());

  // Compute raw scores on the test set.
  JavaRDD<Tuple2<Object, Object>> predictionAndLabels = test.map(
    new Function<LabeledPoint, Tuple2<Object, Object>>() {
      public Tuple2<Object, Object> call(LabeledPoint p) {
        Double prediction = model.predict(p.features());
        return new Tuple2<Object, Object>(prediction, p.label());
      }
    }
  );

  // Get evaluation metrics.
  MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd());
  double accuracy = metrics.accuracy();
  System.out.println("Accuracy = " + accuracy);

  // Save and load model
  model.save(sc, "target/tmp/javaLogisticRegressionWithLBFGSModel");
  LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc,
    "target/tmp/javaLogisticRegressionWithLBFGSModel");
  // $example off$

  sc.stop();
}
 
Example 11
Source File: SparkFileInputStream.java    From incubator-retired-mrql with Apache License 2.0
@Override
public Option<RDD<MRData>> compute ( Time validTime ) {
    JavaRDD<MRData> rdd = null;
    for ( String file: new_files() )
        if (rdd == null)
            rdd = hadoopFile(file);
        else rdd = rdd.union(hadoopFile(file));
    if (rdd == null)
        rdd = SparkEvaluator.spark_context.emptyRDD();
    return new Some<RDD<MRData>>(rdd.rdd());
}
 
Example 12
Source File: CounterConverter.java    From spork with Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors,
        POCounter poCounter) throws IOException {
    SparkUtil.assertPredecessorSize(predecessors, poCounter, 1);
    RDD<Tuple> rdd = predecessors.get(0);
    CounterConverterFunction f = new CounterConverterFunction(poCounter);
    JavaRDD<Tuple> jRdd = rdd.toJavaRDD().mapPartitionsWithIndex(f, true);
    // jRdd = jRdd.cache();
    return jRdd.rdd();
}
 
Example 13
Source File: ChronixRDD.java    From chronix.spark with Apache License 2.0
public ChronixRDD(JavaRDD<MetricTimeSeries> tsRdd) {
    super(tsRdd.rdd(), MTS_TYPE);
}
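
The constructor above works because JavaRDD's public constructor takes the underlying Scala RDD plus a ClassTag for the element type, which is exactly what tsRdd.rdd() supplies. A minimal sketch of the same idea with a hypothetical wrapper class (StringRDD is not part of any project listed here):

import org.apache.spark.api.java.JavaRDD;
import scala.reflect.ClassTag;
import scala.reflect.ClassTag$;

// Hypothetical wrapper, shown only to illustrate the super(...) call in Example 13.
public class StringRDD extends JavaRDD<String> {
  private static final ClassTag<String> STRING_TAG =
      ClassTag$.MODULE$.apply(String.class);

  public StringRDD(JavaRDD<String> delegate) {
    // JavaRDD(RDD<T>, ClassTag<T>): unwrap the Scala RDD and supply the element ClassTag.
    super(delegate.rdd(), STRING_TAG);
  }
}
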
 
Example 14
Source File: DeepSparkContext.java    From deep-spark with Apache License 2.0
private RDD<Cells> createRDDFromFilePath(String filePath, TextFileDataTable textFileDataTable) {
    RDD<String> result = this.sc().textFile(filePath.toString(), 1);
    JavaRDD<Cells> resultCells = result.toJavaRDD().map(new MapSchemaFromLines(textFileDataTable));
    return resultCells.rdd();
}
 
Example 15
Source File: ALSUpdate.java    From oryx with Apache License 2.0
@Override
public PMML buildModel(JavaSparkContext sparkContext,
                       JavaRDD<String> trainData,
                       List<?> hyperParameters,
                       Path candidatePath) {
  int features = (Integer) hyperParameters.get(0);
  double lambda = (Double) hyperParameters.get(1);
  double alpha = (Double) hyperParameters.get(2);
  double epsilon = Double.NaN;
  if (logStrength) {
    epsilon = (Double) hyperParameters.get(3);
  }
  Preconditions.checkArgument(features > 0);
  Preconditions.checkArgument(lambda >= 0.0);
  Preconditions.checkArgument(alpha > 0.0);
  if (logStrength) {
    Preconditions.checkArgument(epsilon > 0.0);
  }

  JavaRDD<String[]> parsedRDD = trainData.map(MLFunctions.PARSE_FN);
  parsedRDD.cache();

  Map<String,Integer> userIDIndexMap = buildIDIndexMapping(parsedRDD, true);
  Map<String,Integer> itemIDIndexMap = buildIDIndexMapping(parsedRDD, false);

  log.info("Broadcasting ID-index mappings for {} users, {} items",
           userIDIndexMap.size(), itemIDIndexMap.size());

  Broadcast<Map<String,Integer>> bUserIDToIndex = sparkContext.broadcast(userIDIndexMap);
  Broadcast<Map<String,Integer>> bItemIDToIndex = sparkContext.broadcast(itemIDIndexMap);

  JavaRDD<Rating> trainRatingData = parsedToRatingRDD(parsedRDD, bUserIDToIndex, bItemIDToIndex);
  trainRatingData = aggregateScores(trainRatingData, epsilon);
  ALS als = new ALS()
      .setRank(features)
      .setIterations(iterations)
      .setLambda(lambda)
      .setCheckpointInterval(5);
  if (implicit) {
    als = als.setImplicitPrefs(true).setAlpha(alpha);
  }

  RDD<Rating> trainingRatingDataRDD = trainRatingData.rdd();
  trainingRatingDataRDD.cache();
  MatrixFactorizationModel model = als.run(trainingRatingDataRDD);
  trainingRatingDataRDD.unpersist(false);

  bUserIDToIndex.unpersist();
  bItemIDToIndex.unpersist();

  parsedRDD.unpersist();

  Broadcast<Map<Integer,String>> bUserIndexToID = sparkContext.broadcast(invertMap(userIDIndexMap));
  Broadcast<Map<Integer,String>> bItemIndexToID = sparkContext.broadcast(invertMap(itemIDIndexMap));

  PMML pmml = mfModelToPMML(model,
                            features,
                            lambda,
                            alpha,
                            epsilon,
                            implicit,
                            logStrength,
                            candidatePath,
                            bUserIndexToID,
                            bItemIndexToID);
  unpersist(model);

  bUserIndexToID.unpersist();
  bItemIndexToID.unpersist();

  return pmml;
}
 
Example 16
Source File: SkewedJoinConverter.java    From spork with Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors,
                          POSkewedJoin poSkewedJoin) throws IOException {

    SparkUtil.assertPredecessorSize(predecessors, poSkewedJoin, 2);
    LRs = new POLocalRearrange[2];
    this.poSkewedJoin = poSkewedJoin;

    createJoinPlans(poSkewedJoin.getJoinPlans());

    // extract the two RDDs
    RDD<Tuple> rdd1 = predecessors.get(0);
    RDD<Tuple> rdd2 = predecessors.get(1);

    // make (key, value) pairs, key has type Object, value has type Tuple
    RDD<Tuple2<Object, Tuple>> rdd1Pair = rdd1.map(new ExtractKeyFunction(
            this, 0), SparkUtil.<Object, Tuple>getTuple2Manifest());
    RDD<Tuple2<Object, Tuple>> rdd2Pair = rdd2.map(new ExtractKeyFunction(
            this, 1), SparkUtil.<Object, Tuple>getTuple2Manifest());

    // join fn is present in JavaPairRDD class ..
    JavaPairRDD<Object, Tuple> rdd1Pair_javaRDD = new JavaPairRDD<Object, Tuple>(
            rdd1Pair, SparkUtil.getManifest(Object.class),
            SparkUtil.getManifest(Tuple.class));
    JavaPairRDD<Object, Tuple> rdd2Pair_javaRDD = new JavaPairRDD<Object, Tuple>(
            rdd2Pair, SparkUtil.getManifest(Object.class),
            SparkUtil.getManifest(Tuple.class));

    // do the join
    JavaPairRDD<Object, Tuple2<Tuple, Tuple>> result_KeyValue = rdd1Pair_javaRDD
            .join(rdd2Pair_javaRDD);

    // map to get RDD<Tuple> from RDD<Object, Tuple2<Tuple, Tuple>> by
    // ignoring the key (of type Object) and appending the values (the
    // Tuples)
    JavaRDD<Tuple> result = result_KeyValue
            .mapPartitions(new ToValueFunction());

    // return type is RDD<Tuple>, so take it from JavaRDD<Tuple>
    return result.rdd();
}