org.apache.spark.mllib.linalg.Vectors Java Examples

The following examples show how to use org.apache.spark.mllib.linalg.Vectors. Each example is taken from an open-source project; the original source file and its license are noted above each listing.
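All of the examples below build org.apache.spark.mllib.linalg.Vector instances through this factory class. As a quick reference, here is a minimal, self-contained sketch of the two factory methods the examples rely on, Vectors.dense and Vectors.sparse (the class name VectorsQuickReference is only illustrative):

import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;

public class VectorsQuickReference {
  public static void main(String[] args) {
    // Dense vector: every entry is stored explicitly.
    Vector dense = Vectors.dense(1.0, 0.0, 3.0);

    // Sparse vector: size 3, with non-zero entries at indices 0 and 2.
    Vector sparse = Vectors.sparse(3, new int[]{0, 2}, new double[]{1.0, 3.0});

    System.out.println("dense:  " + dense);   // [1.0,0.0,3.0]
    System.out.println("sparse: " + sparse);  // (3,[0,2],[1.0,3.0])
  }
}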
Example #1
Source File: JavaSummaryStatisticsExample.java    From SparkDemo with MIT License
public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaSummaryStatisticsExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    JavaRDD<Vector> mat = jsc.parallelize(
      Arrays.asList(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(3.0, 30.0, 300.0)
      )
    ); // an RDD of Vectors

    // Compute column summary statistics.
    MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd());
    System.out.println(summary.mean());  // a dense vector containing the mean value for each column
    System.out.println(summary.variance());  // column-wise variance
    System.out.println(summary.numNonzeros());  // number of nonzeros in each column
    // $example off$

    jsc.stop();
  }
 
Example #2
Source File: Model.java    From DDF with Apache License 2.0
@Override
public Double predict(double[] point) throws DDFException {
  MLClassMethods.PredictMethod predictMethod = new MLClassMethods.PredictMethod(this.getRawModel(),
      MLClassMethods.DEFAULT_PREDICT_METHOD_NAME, new Class<?>[]{Vector.class});
  if (predictMethod.getMethod() == null) {
    throw new DDFException(String.format("Cannot locate method specified by %s", MLClassMethods.DEFAULT_PREDICT_METHOD_NAME));
  }
  Object prediction = predictMethod.instanceInvoke(Vectors.dense(point));
  if (prediction instanceof Double) {
    return (Double) prediction;
  } else if (prediction instanceof Integer) {
    return ((Integer) prediction).doubleValue();
  } else {
    throw new DDFException(String.format("Error getting prediction from model %s", this.getRawModel().getClass().getName()));
  }
}
 
Example #3
Source File: SparkConverter.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 * Create a distributed matrix given an Apache Commons RealMatrix.
 *
 * @param sc Never {@code null}
 * @param realMat Apache Commons RealMatrix.  Never {@code null}
 * @param numSlices Number of partitions to use when parallelizing the matrix rows
 * @return A distributed Spark matrix
 */
public static RowMatrix convertRealMatrixToSparkRowMatrix(JavaSparkContext sc, RealMatrix realMat, int numSlices) {
    logger.info("Converting matrix to distributed Spark matrix...");
    final double [][] dataArray = realMat.getData();
    final LinkedList<Vector> rowsList = new LinkedList<>();
    for (final double [] i : dataArray) {
        final Vector currentRow = Vectors.dense(i);
        rowsList.add(currentRow);
    }

    // We may want to swap out this static value for something dynamic (as shown below), but this seems to slow it down.
    // final int totalSpace = realMat.getColumnDimension() * realMat.getRowDimension() * Double.BYTES;
    // // Want the partitions to be ~100KB of space
    // final int slices = totalSpace/100000;
    final JavaRDD<Vector> rows = sc.parallelize(rowsList, numSlices);

    // Create a RowMatrix from JavaRDD<Vector>.
    final RowMatrix mat = new RowMatrix(rows.rdd());
    logger.info("Done converting matrix to distributed Spark matrix...");
    return mat;
}
 
Example #4
Source File: SparkConverter.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Create a distributed matrix given an Apache Commons RealMatrix.
 *
 * @param sc Never {@code null}
 * @param realMat Apache Commons RealMatrix.  Never {@code null}
 * @param numSlices Number of partitions to use when parallelizing the matrix rows
 * @return A distributed Spark matrix
 */
public static RowMatrix convertRealMatrixToSparkRowMatrix(JavaSparkContext sc, RealMatrix realMat, int numSlices) {
    logger.info("Converting matrix to distributed Spark matrix...");
    final double [][] dataArray = realMat.getData();
    final LinkedList<Vector> rowsList = new LinkedList<>();
    for (final double [] i : dataArray) {
        final Vector currentRow = Vectors.dense(i);
        rowsList.add(currentRow);
    }

    // We may want to swap out this static value for something dynamic (as shown below), but this seems to slow it down.
    // final int totalSpace = realMat.getColumnDimension() * realMat.getRowDimension() * Double.BYTES;
    // // Want the partitions to be ~100KB of space
    // final int slices = totalSpace/100000;
    final JavaRDD<Vector> rows = sc.parallelize(rowsList, numSlices);

    // Create a RowMatrix from JavaRDD<Vector>.
    final RowMatrix mat = new RowMatrix(rows.rdd());
    logger.info("Done converting matrix to distributed Spark matrix...");
    return mat;
}
 
Example #5
Source File: JavaPCAExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("PCA Example");
  SparkContext sc = new SparkContext(conf);

  // $example on$
  double[][] array = {{1.12, 2.05, 3.12}, {5.56, 6.28, 8.94}, {10.2, 8.0, 20.5}};
  LinkedList<Vector> rowsList = new LinkedList<>();
  for (int i = 0; i < array.length; i++) {
    Vector currentRow = Vectors.dense(array[i]);
    rowsList.add(currentRow);
  }
  JavaRDD<Vector> rows = JavaSparkContext.fromSparkContext(sc).parallelize(rowsList);

  // Create a RowMatrix from JavaRDD<Vector>.
  RowMatrix mat = new RowMatrix(rows.rdd());

  // Compute the top 3 principal components.
  Matrix pc = mat.computePrincipalComponents(3);
  RowMatrix projected = mat.multiply(pc);
  // $example off$
  Vector[] collectPartitions = (Vector[]) projected.rows().collect();
  System.out.println("Projected vector of principal component:");
  for (Vector vector : collectPartitions) {
    System.out.println("\t" + vector);
  }
}
 
Example #6
Source File: MLLibUtil.java    From deeplearning4j with Apache License 2.0
/**
 * Returns a labeled point of the writables
 * where the final item is the label and the rest of the items are
 * features
 * @param writables the writables
 * @return the labeled point
 */
public static LabeledPoint pointOf(Collection<Writable> writables) {
    double[] ret = new double[writables.size() - 1];
    int count = 0;
    double target = 0;
    for (Writable w : writables) {
        if (count < writables.size() - 1)
            ret[count++] = Float.parseFloat(w.toString());
        else
            target = Float.parseFloat(w.toString());
    }

    if (target < 0)
        throw new IllegalStateException("Target must be >= 0");
    return new LabeledPoint(target, Vectors.dense(ret));
}
 
Example #7
Source File: FeatureValueInstanceUtils.java    From ambiverse-nlu with Apache License 2.0
public static Vector convertToSparkMLVector(FeatureValueInstance fvi, int vectorSize) {
  Map<Integer, Double> featureValues = fvi.getFeatureValues();
  List<Tuple2<Integer, Double>> sortedFeatureValues =
      featureValues.entrySet().stream()
          .sorted((o1, o2) -> Integer.compare(o1.getKey(), o2.getKey()))
          .map(o -> new Tuple2<>(o.getKey(), o.getValue()))
          .collect(Collectors.toList());

  int[] features = new int[sortedFeatureValues.size()];
  double[] values = new double[sortedFeatureValues.size()];

  int i = 0;
  for (Tuple2<Integer, Double> fv : sortedFeatureValues) {
    features[i] = fv._1();
    values[i] = fv._2();
    ++i;
  }

  Vector v = Vectors.sparse(vectorSize, features, values);
  return v;
}
 
Example #8
Source File: JavaSVDExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("SVD Example");
  SparkContext sc = new SparkContext(conf);
  JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc);

  // $example on$
  double[][] array = {{1.12, 2.05, 3.12}, {5.56, 6.28, 8.94}, {10.2, 8.0, 20.5}};
  LinkedList<Vector> rowsList = new LinkedList<>();
  for (int i = 0; i < array.length; i++) {
    Vector currentRow = Vectors.dense(array[i]);
    rowsList.add(currentRow);
  }
  JavaRDD<Vector> rows = jsc.parallelize(rowsList);

  // Create a RowMatrix from JavaRDD<Vector>.
  RowMatrix mat = new RowMatrix(rows.rdd());

  // Compute the top 3 singular values and corresponding singular vectors.
  SingularValueDecomposition<RowMatrix, Matrix> svd = mat.computeSVD(3, true, 1.0E-9d);
  RowMatrix U = svd.U();
  Vector s = svd.s();
  Matrix V = svd.V();
  // $example off$
  Vector[] collectPartitions = (Vector[]) U.rows().collect();
  System.out.println("U factor is:");
  for (Vector vector : collectPartitions) {
    System.out.println("\t" + vector);
  }
  System.out.println("Singular values are: " + s);
  System.out.println("V factor is:\n" + V);

  jsc.stop();
}
 
Example #9
Source File: JavaBisectingKMeansExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf sparkConf = new SparkConf().setAppName("JavaBisectingKMeansExample");
  JavaSparkContext sc = new JavaSparkContext(sparkConf);

  // $example on$
  ArrayList<Vector> localData = Lists.newArrayList(
    Vectors.dense(0.1, 0.1),   Vectors.dense(0.3, 0.3),
    Vectors.dense(10.1, 10.1), Vectors.dense(10.3, 10.3),
    Vectors.dense(20.1, 20.1), Vectors.dense(20.3, 20.3),
    Vectors.dense(30.1, 30.1), Vectors.dense(30.3, 30.3)
  );
  JavaRDD<Vector> data = sc.parallelize(localData, 2);

  BisectingKMeans bkm = new BisectingKMeans()
    .setK(4);
  BisectingKMeansModel model = bkm.run(data);

  System.out.println("Compute Cost: " + model.computeCost(data));

  Vector[] clusterCenters = model.clusterCenters();
  for (int i = 0; i < clusterCenters.length; i++) {
    Vector clusterCenter = clusterCenters[i];
    System.out.println("Cluster Center " + i + ": " + clusterCenter);
  }
  // $example off$

  sc.stop();
}
 
Example #10
Source File: MLLibUtil.java    From deeplearning4j with Apache License 2.0
/**
 * Convert an ndarray to a vector
 * @param arr the array
 * @return an mllib vector
 */
public static Vector toVector(INDArray arr) {
    if (!arr.isVector()) {
        throw new IllegalArgumentException("passed in array must be a vector");
    }
    if (arr.length() > Integer.MAX_VALUE)
        throw new ND4JArraySizeException();
    double[] ret = new double[(int) arr.length()];
    for (int i = 0; i < arr.length(); i++) {
        ret[i] = arr.getDouble(i);
    }

    return Vectors.dense(ret);
}
 
Example #11
Source File: JavaGaussianMixtureExample.java    From SparkDemo with MIT License
public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaGaussianMixtureExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    // Load and parse data
    String path = "data/mllib/gmm_data.txt";
    JavaRDD<String> data = jsc.textFile(path);
    JavaRDD<Vector> parsedData = data.map(
      new Function<String, Vector>() {
        public Vector call(String s) {
          String[] sarray = s.trim().split(" ");
          double[] values = new double[sarray.length];
          for (int i = 0; i < sarray.length; i++) {
            values[i] = Double.parseDouble(sarray[i]);
          }
          return Vectors.dense(values);
        }
      }
    );
    parsedData.cache();

    // Cluster the data into two classes using GaussianMixture
    GaussianMixtureModel gmm = new GaussianMixture().setK(2).run(parsedData.rdd());

    // Save and load GaussianMixtureModel
    gmm.save(jsc.sc(), "target/org/apache/spark/JavaGaussianMixtureExample/GaussianMixtureModel");
    GaussianMixtureModel sameModel = GaussianMixtureModel.load(jsc.sc(),
      "target/org/apache/spark/JavaGaussianMixtureExample/GaussianMixtureModel");

    // Output the parameters of the mixture model
    for (int j = 0; j < gmm.k(); j++) {
      System.out.printf("weight=%f\nmu=%s\nsigma=\n%s\n",
        gmm.weights()[j], gmm.gaussians()[j].mu(), gmm.gaussians()[j].sigma());
    }
    // $example off$

    jsc.stop();
  }
 
Example #12
Source File: RDFUpdate.java    From oryx with Apache License 2.0
private JavaRDD<LabeledPoint> parseToLabeledPointRDD(
    JavaRDD<String[]> parsedRDD,
    CategoricalValueEncodings categoricalValueEncodings) {

  return parsedRDD.map(data -> {
    try {
      double[] features = new double[inputSchema.getNumPredictors()];
      double target = Double.NaN;
      for (int featureIndex = 0; featureIndex < data.length; featureIndex++) {
        double encoded;
        if (inputSchema.isNumeric(featureIndex)) {
          encoded = Double.parseDouble(data[featureIndex]);
        } else if (inputSchema.isCategorical(featureIndex)) {
          Map<String,Integer> valueEncoding =
              categoricalValueEncodings.getValueEncodingMap(featureIndex);
          encoded = valueEncoding.get(data[featureIndex]);
        } else {
          continue;
        }
        if (inputSchema.isTarget(featureIndex)) {
          target = encoded;
        } else {
          features[inputSchema.featureToPredictorIndex(featureIndex)] = encoded;
        }
      }
      Preconditions.checkState(!Double.isNaN(target));
      return new LabeledPoint(target, Vectors.dense(features));
    } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
      log.warn("Bad input: {}", Arrays.toString(data));
      throw e;
    }
  });
}
 
Example #13
Source File: KMeansUpdate.java    From oryx with Apache License 2.0
private JavaRDD<Vector> parsedToVectorRDD(JavaRDD<String[]> parsedRDD) {
  return parsedRDD.map(data -> {
    try {
      return Vectors.dense(KMeansUtils.featuresFromTokens(data, inputSchema));
    } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
      log.warn("Bad input: {}", Arrays.toString(data));
      throw e;
    }
  });
}
 
Example #14
Source File: JavaCorrelationsExample.java    From SparkDemo with MIT License
public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaCorrelationsExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    JavaDoubleRDD seriesX = jsc.parallelizeDoubles(
      Arrays.asList(1.0, 2.0, 3.0, 3.0, 5.0));  // a series

    // must have the same number of partitions and cardinality as seriesX
    JavaDoubleRDD seriesY = jsc.parallelizeDoubles(
      Arrays.asList(11.0, 22.0, 33.0, 33.0, 555.0));

    // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson");
    System.out.println("Correlation is: " + correlation);

    // note that each Vector is a row and not a column
    JavaRDD<Vector> data = jsc.parallelize(
      Arrays.asList(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(5.0, 33.0, 366.0)
      )
    );

    // calculate the correlation matrix using Pearson's method.
    // Use "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson");
    System.out.println(correlMatrix.toString());
    // $example off$

    jsc.stop();
  }
 
Example #15
Source File: MinMaxScalerBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testMinMaxScaler() {
    //prepare data
    List<LabeledPoint> localTraining = Arrays.asList(
            new LabeledPoint(1.0, Vectors.dense(data[0])),
            new LabeledPoint(2.0, Vectors.dense(data[1])),
            new LabeledPoint(3.0, Vectors.dense(data[2])),
            new LabeledPoint(3.0, Vectors.dense(data[3])));
    DataFrame df = sqlContext.createDataFrame(sc.parallelize(localTraining), LabeledPoint.class);

    //train model in spark
    MinMaxScalerModel sparkModel = new MinMaxScaler()
            .setInputCol("features")
            .setOutputCol("scaled")
            .setMin(-5)
            .setMax(5)
            .fit(df);


    //Export model, import it back and get transformer
    byte[] exportedModel = ModelExporter.export(sparkModel, df);
    final Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //compare predictions
    Row[] sparkOutput = sparkModel.transform(df).orderBy("label").select("features", "scaled").collect();
    assertCorrectness(sparkOutput, expected, transformer);
}
 
Example #16
Source File: Log1PScalerBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testCustomScalerDenseVector() {
    final double[][] precomputedAns = new double[3][3];
    //precompute answers
    for (int j = 0; j < 3; j++)
        for (int k = 0; k < 3; k++)
            precomputedAns[j][k] = Math.log1p(data[j][k]);

    //prepare data
    List<LabeledPoint> localTraining = Arrays.asList(
            new LabeledPoint(1.0, Vectors.dense(data[0])),
            new LabeledPoint(2.0, Vectors.dense(data[1])),
            new LabeledPoint(3.0, Vectors.dense(data[2])));
    DataFrame df = sqlContext.createDataFrame(sc.parallelize(localTraining), LabeledPoint.class);

    for (int i = 0; i < 2; i++) {
        //train model in spark
        Log1PScaler sparkModel = new Log1PScaler()
                .setInputCol("features")
                .setOutputCol("scaledOutput");

        //Export model, import it back and get transformer
        byte[] exportedModel = ModelExporter.export(sparkModel, df);
        final Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

        //compare predictions
        Row[] sparkOutput = sparkModel.transform(df).orderBy("label").select("features", "scaledOutput").collect();
        assertCorrectness(sparkOutput, precomputedAns, transformer);
    }
}
 
Example #17
Source File: KMeansClusteringMlib.java    From Java-Data-Science-Cookbook with MIT License
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setMaster("local[4]").setAppName("K-means Example");
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Load and parse data
    String path = "data/km-data.txt";
    JavaRDD<String> data = sc.textFile(path);
    JavaRDD<Vector> parsedData = data.map(
      new Function<String, Vector>() {
        public Vector call(String s) {
          String[] sarray = s.split(" ");
          double[] values = new double[sarray.length];
          for (int i = 0; i < sarray.length; i++)
            values[i] = Double.parseDouble(sarray[i]);
          return Vectors.dense(values);
        }
      }
    );
    parsedData.cache();

    // Cluster the data into two classes using KMeans
    int numClusters = 2;
    int numIterations = 20;
    KMeansModel clusters = KMeans.train(parsedData.rdd(), numClusters, numIterations);

    // Evaluate clustering by computing Within Set Sum of Squared Errors
    double WSSSE = clusters.computeCost(parsedData.rdd());
    System.out.println("Within Set Sum of Squared Errors = " + WSSSE);
	
	
	
}
 
Example #18
Source File: KMeansEvalIT.java    From oryx with Apache License 2.0
private static JavaRDD<Vector> getRddOfVectors() {
  List<double[]> points = Arrays.asList(new double[][] {
      {1.0, 0.0}, {2.0, -2.0}, {2.0, 0.0}, {-2.0, 0.0}, {-0.5, -1.0}, {-0.5, 1.0}
  });
  return getJavaSparkContext().parallelize(points).map(Vectors::dense);
}
 
Example #19
Source File: TestSparkMultiLayerParameterAveraging.java    From deeplearning4j with Apache License 2.0
@Override
public LabeledPoint call(LabeledPoint v1) throws Exception {
    return new LabeledPoint(v1.label(), Vectors.dense(v1.features().toArray()));
}
 
Example #20
Source File: SparkMLTrainingAndScoringOnline.java    From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License
public static void main(String[] args) throws InterruptedException {

    System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

    final SparkConf conf = new SparkConf()
        .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
        .setAppName(APPLICATION_NAME)
        .set("spark.sql.caseSensitive", CASE_SENSITIVE);

    JavaStreamingContext streamingContext = new JavaStreamingContext(conf,
        new Duration(BATCH_DURATION_INTERVAL_MS));

    JavaInputDStream<ConsumerRecord<String, String>> meetupStream =
        KafkaUtils.createDirectStream(
            streamingContext,
            LocationStrategies.PreferConsistent(),
            ConsumerStrategies.<String, String>Subscribe(TOPICS, KAFKA_CONSUMER_PROPERTIES)
        );

    JavaDStream<String> meetupStreamValues =
        meetupStream.map(v -> {
            return v.value();
        });

    // Prepare the training data as strings of type: (y,[x1,x2,x3,...,xn])
    // Where n is the number of features, y is a binary label,
    // and n must be the same for train and test.
    // e.g. "(response, [group_lat, group_long])";
    JavaDStream<String> trainData = meetupStreamValues.map(e -> {

        JSONParser jsonParser = new JSONParser();
        JSONObject json = (JSONObject) jsonParser.parse(e);

        String result = "("
            + (String.valueOf(json.get("response")).equals("yes") ? "1.0,[" : "0.0,[")
            + ((JSONObject) json.get("group")).get("group_lat") + ","
            + ((JSONObject) json.get("group")).get("group_lon")
            + "])";

        return result;
    });

    trainData.print();

    JavaDStream<LabeledPoint> labeledPoints = trainData.map(LabeledPoint::parse);

    StreamingLogisticRegressionWithSGD streamingLogisticRegressionWithSGD =
        new StreamingLogisticRegressionWithSGD()
            .setInitialWeights(Vectors.zeros(2));

    streamingLogisticRegressionWithSGD.trainOn(labeledPoints);

    JavaPairDStream<Double, Vector> values =
        labeledPoints.mapToPair(f -> new Tuple2<>(f.label(), f.features()));

    streamingLogisticRegressionWithSGD.predictOnValues(values).print();

    // some time later, after outputs have completed
    meetupStream.foreachRDD((JavaRDD<ConsumerRecord<String, String>> meetupRDD) -> {
        OffsetRange[] offsetRanges = ((HasOffsetRanges) meetupRDD.rdd()).offsetRanges();

        ((CanCommitOffsets) meetupStream.inputDStream())
            .commitAsync(offsetRanges, new MeetupOffsetCommitCallback());
    });

    streamingContext.start();
    streamingContext.awaitTermination();
}
 
Example #21
Source File: StandardScalerBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testStandardScaler() {
    //prepare data
    List<LabeledPoint> localTraining = Arrays.asList(
            new LabeledPoint(1.0, Vectors.dense(data[0])),
            new LabeledPoint(2.0, Vectors.dense(data[1])),
            new LabeledPoint(3.0, Vectors.dense(data[2])));
    DataFrame df = sqlContext.createDataFrame(sc.parallelize(localTraining), LabeledPoint.class);


    //train model in spark
    StandardScalerModel sparkModelNone = new StandardScaler()
            .setInputCol("features")
            .setOutputCol("scaledOutput")
            .setWithMean(false)
            .setWithStd(false)
            .fit(df);

    StandardScalerModel sparkModelWithMean = new StandardScaler()
            .setInputCol("features")
            .setOutputCol("scaledOutput")
            .setWithMean(true)
            .setWithStd(false)
            .fit(df);

    StandardScalerModel sparkModelWithStd = new StandardScaler()
            .setInputCol("features")
            .setOutputCol("scaledOutput")
            .setWithMean(false)
            .setWithStd(true)
            .fit(df);

    StandardScalerModel sparkModelWithBoth = new StandardScaler()
            .setInputCol("features")
            .setOutputCol("scaledOutput")
            .setWithMean(true)
            .setWithStd(true)
            .fit(df);


    //Export model, import it back and get transformer
    byte[] exportedModel = ModelExporter.export(sparkModelNone, df);
    final Transformer transformerNone = ModelImporter.importAndGetTransformer(exportedModel);

    exportedModel = ModelExporter.export(sparkModelWithMean, df);
    final Transformer transformerWithMean = ModelImporter.importAndGetTransformer(exportedModel);

    exportedModel = ModelExporter.export(sparkModelWithStd, df);
    final Transformer transformerWithStd = ModelImporter.importAndGetTransformer(exportedModel);

    exportedModel = ModelExporter.export(sparkModelWithBoth, df);
    final Transformer transformerWithBoth = ModelImporter.importAndGetTransformer(exportedModel);


    //compare predictions
    Row[] sparkNoneOutput = sparkModelNone.transform(df).orderBy("label").select("features", "scaledOutput").collect();
    assertCorrectness(sparkNoneOutput, data, transformerNone);

    Row[] sparkWithMeanOutput = sparkModelWithMean.transform(df).orderBy("label").select("features", "scaledOutput").collect();
    assertCorrectness(sparkWithMeanOutput, resWithMean, transformerWithMean);

    Row[] sparkWithStdOutput = sparkModelWithStd.transform(df).orderBy("label").select("features", "scaledOutput").collect();
    assertCorrectness(sparkWithStdOutput, resWithStd, transformerWithStd);

    Row[] sparkWithBothOutput = sparkModelWithBoth.transform(df).orderBy("label").select("features", "scaledOutput").collect();
    assertCorrectness(sparkWithBothOutput, resWithBoth, transformerWithBoth);

}
 
Example #22
Source File: JavaLinearRegressionWithSGDExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("JavaLinearRegressionWithSGDExample");
  JavaSparkContext sc = new JavaSparkContext(conf);

  // $example on$
  // Load and parse the data
  String path = "data/mllib/ridge-data/lpsa.data";
  JavaRDD<String> data = sc.textFile(path);
  JavaRDD<LabeledPoint> parsedData = data.map(
    new Function<String, LabeledPoint>() {
      public LabeledPoint call(String line) {
        String[] parts = line.split(",");
        String[] features = parts[1].split(" ");
        double[] v = new double[features.length];
        for (int i = 0; i < features.length; i++) {
          v[i] = Double.parseDouble(features[i]);
        }
        return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v));
      }
    }
  );
  parsedData.cache();

  // Building the model
  int numIterations = 100;
  double stepSize = 0.00000001;
  final LinearRegressionModel model =
    LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), numIterations, stepSize);

  // Evaluate model on training examples and compute training error
  JavaRDD<Tuple2<Double, Double>> valuesAndPreds = parsedData.map(
    new Function<LabeledPoint, Tuple2<Double, Double>>() {
      public Tuple2<Double, Double> call(LabeledPoint point) {
        double prediction = model.predict(point.features());
        return new Tuple2<>(prediction, point.label());
      }
    }
  );
  double MSE = new JavaDoubleRDD(valuesAndPreds.map(
    new Function<Tuple2<Double, Double>, Object>() {
      public Object call(Tuple2<Double, Double> pair) {
        return Math.pow(pair._1() - pair._2(), 2.0);
      }
    }
  ).rdd()).mean();
  System.out.println("training Mean Squared Error = " + MSE);

  // Save and load model
  model.save(sc.sc(), "target/tmp/javaLinearRegressionWithSGDModel");
  LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(),
    "target/tmp/javaLinearRegressionWithSGDModel");
  // $example off$

  sc.stop();
}
 
Example #23
Source File: JavaKMeansExample.java    From SparkDemo with MIT License
public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaKMeansExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    // Load and parse data
    String path = "data/mllib/kmeans_data.txt";
    JavaRDD<String> data = jsc.textFile(path);
    JavaRDD<Vector> parsedData = data.map(
      new Function<String, Vector>() {
        public Vector call(String s) {
          String[] sarray = s.split(" ");
          double[] values = new double[sarray.length];
          for (int i = 0; i < sarray.length; i++) {
            values[i] = Double.parseDouble(sarray[i]);
          }
          return Vectors.dense(values);
        }
      }
    );
    parsedData.cache();

    // Cluster the data into two classes using KMeans
    int numClusters = 2;
    int numIterations = 20;
    KMeansModel clusters = KMeans.train(parsedData.rdd(), numClusters, numIterations);

    System.out.println("Cluster centers:");
    for (Vector center: clusters.clusterCenters()) {
      System.out.println(" " + center);
    }
    double cost = clusters.computeCost(parsedData.rdd());
    System.out.println("Cost: " + cost);

    // Evaluate clustering by computing Within Set Sum of Squared Errors
    double WSSSE = clusters.computeCost(parsedData.rdd());
    System.out.println("Within Set Sum of Squared Errors = " + WSSSE);

    // Save and load model
    clusters.save(jsc.sc(), "target/org/apache/spark/JavaKMeansExample/KMeansModel");
    KMeansModel sameModel = KMeansModel.load(jsc.sc(),
      "target/org/apache/spark/JavaKMeansExample/KMeansModel");
    // $example off$

    jsc.stop();
  }
 
Example #24
Source File: JavaRegressionMetricsExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("Java Regression Metrics Example");
  JavaSparkContext sc = new JavaSparkContext(conf);
  // $example on$
  // Load and parse the data
  String path = "data/mllib/sample_linear_regression_data.txt";
  JavaRDD<String> data = sc.textFile(path);
  JavaRDD<LabeledPoint> parsedData = data.map(
    new Function<String, LabeledPoint>() {
      public LabeledPoint call(String line) {
        String[] parts = line.split(" ");
        double[] v = new double[parts.length - 1];
        for (int i = 1; i < parts.length; i++) {
          v[i - 1] = Double.parseDouble(parts[i].split(":")[1]);
        }
        return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v));
      }
    }
  );
  parsedData.cache();

  // Building the model
  int numIterations = 100;
  final LinearRegressionModel model = LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData),
    numIterations);

  // Evaluate model on training examples and compute training error
  JavaRDD<Tuple2<Object, Object>> valuesAndPreds = parsedData.map(
    new Function<LabeledPoint, Tuple2<Object, Object>>() {
      public Tuple2<Object, Object> call(LabeledPoint point) {
        double prediction = model.predict(point.features());
        return new Tuple2<Object, Object>(prediction, point.label());
      }
    }
  );

  // Instantiate metrics object
  RegressionMetrics metrics = new RegressionMetrics(valuesAndPreds.rdd());

  // Squared error
  System.out.format("MSE = %f\n", metrics.meanSquaredError());
  System.out.format("RMSE = %f\n", metrics.rootMeanSquaredError());

  // R-squared
  System.out.format("R Squared = %f\n", metrics.r2());

  // Mean absolute error
  System.out.format("MAE = %f\n", metrics.meanAbsoluteError());

  // Explained variance
  System.out.format("Explained Variance = %f\n", metrics.explainedVariance());

  // Save and load model
  model.save(sc.sc(), "target/tmp/LogisticRegressionModel");
  LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(),
    "target/tmp/LogisticRegressionModel");
  // $example off$

  sc.stop();
}
 
Example #25
Source File: JavaHypothesisTestingExample.java    From SparkDemo with MIT License
public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    // a vector composed of the frequencies of events
    Vector vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25);

    // compute the goodness of fit. If a second vector to test against is not supplied
    // as a parameter, the test runs against a uniform distribution.
    ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec);
    // summary of the test including the p-value, degrees of freedom, test statistic,
    // the method used, and the null hypothesis.
    System.out.println(goodnessOfFitTestResult + "\n");

    // Create a contingency matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
    Matrix mat = Matrices.dense(3, 2, new double[]{1.0, 3.0, 5.0, 2.0, 4.0, 6.0});

    // conduct Pearson's independence test on the input contingency matrix
    ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat);
    // summary of the test including the p-value, degrees of freedom...
    System.out.println(independenceTestResult + "\n");

    // an RDD of labeled points
    JavaRDD<LabeledPoint> obs = jsc.parallelize(
      Arrays.asList(
        new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)),
        new LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)),
        new LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))
      )
    );

    // The contingency table is constructed from the raw (feature, label) pairs and used to conduct
    // the independence test. Returns an array containing the ChiSquaredTestResult for every feature
    // against the label.
    ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd());
    int i = 1;
    for (ChiSqTestResult result : featureTestResults) {
      System.out.println("Column " + i + ":");
      System.out.println(result + "\n");  // summary of the test
      i++;
    }
    // $example off$

    jsc.stop();
  }
 
Example #26
Source File: JavaLatentDirichletAllocationExample.java    From SparkDemo with MIT License
public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaKLatentDirichletAllocationExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    // Load and parse the data
    String path = "data/mllib/sample_lda_data.txt";
    JavaRDD<String> data = jsc.textFile(path);
    JavaRDD<Vector> parsedData = data.map(
      new Function<String, Vector>() {
        public Vector call(String s) {
          String[] sarray = s.trim().split(" ");
          double[] values = new double[sarray.length];
          for (int i = 0; i < sarray.length; i++) {
            values[i] = Double.parseDouble(sarray[i]);
          }
          return Vectors.dense(values);
        }
      }
    );
    // Index documents with unique IDs
    JavaPairRDD<Long, Vector> corpus =
      JavaPairRDD.fromJavaRDD(parsedData.zipWithIndex().map(
        new Function<Tuple2<Vector, Long>, Tuple2<Long, Vector>>() {
          public Tuple2<Long, Vector> call(Tuple2<Vector, Long> doc_id) {
            return doc_id.swap();
          }
        }
      )
    );
    corpus.cache();

    // Cluster the documents into three topics using LDA
    LDAModel ldaModel = new LDA().setK(3).run(corpus);

    // Output topics. Each is a distribution over words (matching word count vectors)
    System.out.println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize()
      + " words):");
    Matrix topics = ldaModel.topicsMatrix();
    for (int topic = 0; topic < 3; topic++) {
      System.out.print("Topic " + topic + ":");
      for (int word = 0; word < ldaModel.vocabSize(); word++) {
        System.out.print(" " + topics.apply(word, topic));
      }
      System.out.println();
    }

    ldaModel.save(jsc.sc(),
      "target/org/apache/spark/JavaLatentDirichletAllocationExample/LDAModel");
    DistributedLDAModel sameModel = DistributedLDAModel.load(jsc.sc(),
      "target/org/apache/spark/JavaLatentDirichletAllocationExample/LDAModel");
    // $example off$

    jsc.stop();
  }
 
Example #27
Source File: LinearRegressionMlib.java    From Java-Data-Science-Cookbook with MIT License
public static void main(String[] args) {
	SparkConf configuration = new SparkConf().setMaster("local[4]").setAppName("Linear Regression Example");
	JavaSparkContext sparkContext = new JavaSparkContext(configuration);

	// Load and parse the data
	String inputData = "data/lr-data.txt";
	JavaRDD<String> data = sparkContext.textFile(inputData);
	JavaRDD<LabeledPoint> parsedData = data.map(
			new Function<String, LabeledPoint>() {
				public LabeledPoint call(String line) {
					String[] parts = line.split(",");
					String[] features = parts[1].split(" ");
					double[] featureVector = new double[features.length];
					for (int i = 0; i < features.length; i++) {
						featureVector[i] = Double.parseDouble(features[i]);
					}
					return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(featureVector));
				}
			}
	);
	parsedData.cache();

	// Building the model
	int numIterations = 100;
	final LinearRegressionModel model = 
			LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), numIterations);

	// Evaluate model on training examples and compute training error
	JavaRDD<Tuple2<Double, Double>> valuesAndPreds = parsedData.map(
			new Function<LabeledPoint, Tuple2<Double, Double>>() {
				public Tuple2<Double, Double> call(LabeledPoint point) {
					double prediction = model.predict(point.features());
					return new Tuple2<Double, Double>(prediction, point.label());
				}
			}
	);
	double MSE = new JavaDoubleRDD(valuesAndPreds.map(
			new Function<Tuple2<Double, Double>, Object>() {
				public Object call(Tuple2<Double, Double> pair) {
					return Math.pow(pair._1() - pair._2(), 2.0);
				}
			}
	).rdd()).mean();
	System.out.println("training Mean Squared Error = " + MSE);
}