org.apache.spark.ml.linalg.Vectors Java Examples

The following examples show how to use org.apache.spark.ml.linalg.Vectors. They are drawn from open-source projects; the source file, originating project, and license are noted above each example.
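Before the project-specific examples, a minimal self-contained sketch of the factory methods these snippets rely on may help. It uses only the public org.apache.spark.ml.linalg API (Vectors.dense, Vectors.sparse, and the Vector accessors); the class and variable names are illustrative.

import org.apache.spark.ml.linalg.Vector;
import org.apache.spark.ml.linalg.Vectors;

public class VectorsBasics {
  public static void main(String[] args) {
    // Dense vector: every value is stored explicitly.
    Vector dense = Vectors.dense(1.0, 0.0, 3.0);

    // Sparse vector: size 3, non-zero entries at indices 0 and 2.
    Vector sparse = Vectors.sparse(3, new int[]{0, 2}, new double[]{1.0, 3.0});

    System.out.println(dense.size());            // 3
    System.out.println(sparse.apply(2));         // 3.0
    System.out.println(dense.toArray().length);  // 3
  }
}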
Example #1
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testDataFrameSumDMLVectorWithIDColumn() {
	System.out.println("MLContextTest - DataFrame sum DML, vector with ID column");

	List<Tuple2<Double, Vector>> list = new ArrayList<>();
	list.add(new Tuple2<>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
	list.add(new Tuple2<>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
	list.add(new Tuple2<>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
	JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR_WITH_INDEX);

	Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
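The DoubleVectorRow mapper referenced above (like the VectorRow and MllibVectorRow mappers in the later examples) is defined elsewhere in MLContextTest and is not shown here. A plausible minimal sketch, assuming it simply flattens the (id, vector) tuple into a two-column Row matching the schema, would be:

// Hypothetical sketch of the DoubleVectorRow mapper; the real class in
// MLContextTest may differ in detail.
static class DoubleVectorRow implements org.apache.spark.api.java.function.Function<Tuple2<Double, Vector>, Row> {
	private static final long serialVersionUID = 1L;

	@Override
	public Row call(Tuple2<Double, Vector> tuple) throws Exception {
		return RowFactory.create(tuple._1(), tuple._2());
	}
}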
 
Example #2
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testDataFrameSumDMLVectorWithNoIDColumnNoFormatSpecified() {
	System.out.println("MLContextTest - DataFrame sum DML, vector with no ID column, no format specified");

	List<Vector> list = new ArrayList<>();
	list.add(Vectors.dense(1.0, 2.0, 3.0));
	list.add(Vectors.dense(4.0, 5.0, 6.0));
	list.add(Vectors.dense(7.0, 8.0, 9.0));
	JavaRDD<Vector> javaRddVector = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddVector.map(new VectorRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
 
Example #3
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testDataFrameSumDMLVectorWithIDColumnNoFormatSpecified() {
	System.out.println("MLContextTest - DataFrame sum DML, vector with ID column, no format specified");

	List<Tuple2<Double, Vector>> list = new ArrayList<>();
	list.add(new Tuple2<>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
	list.add(new Tuple2<>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
	list.add(new Tuple2<>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
	JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
 
Example #4
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testDataFrameSumDMLMllibVectorWithNoIDColumn() {
	System.out.println("MLContextTest - DataFrame sum DML, mllib vector with no ID column");

	List<org.apache.spark.mllib.linalg.Vector> list = new ArrayList<>();
	list.add(org.apache.spark.mllib.linalg.Vectors.dense(1.0, 2.0, 3.0));
	list.add(org.apache.spark.mllib.linalg.Vectors.dense(4.0, 5.0, 6.0));
	list.add(org.apache.spark.mllib.linalg.Vectors.dense(7.0, 8.0, 9.0));
	JavaRDD<org.apache.spark.mllib.linalg.Vector> javaRddVector = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddVector.map(new MllibVectorRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField("C1", new org.apache.spark.mllib.linalg.VectorUDT(), true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR);

	Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
 
Example #5
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testDataFrameSumDMLVectorWithNoIDColumn() {
	System.out.println("MLContextTest - DataFrame sum DML, vector with no ID column");

	List<Vector> list = new ArrayList<>();
	list.add(Vectors.dense(1.0, 2.0, 3.0));
	list.add(Vectors.dense(4.0, 5.0, 6.0));
	list.add(Vectors.dense(7.0, 8.0, 9.0));
	JavaRDD<Vector> javaRddVector = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddVector.map(new VectorRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR);

	Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
 
Example #6
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testDataFrameSumDMLMllibVectorWithIDColumn() {
	System.out.println("MLContextTest - DataFrame sum DML, mllib vector with ID column");

	List<Tuple2<Double, org.apache.spark.mllib.linalg.Vector>> list = new ArrayList<>();
	list.add(new Tuple2<>(1.0, org.apache.spark.mllib.linalg.Vectors.dense(1.0, 2.0, 3.0)));
	list.add(new Tuple2<>(2.0, org.apache.spark.mllib.linalg.Vectors.dense(4.0, 5.0, 6.0)));
	list.add(new Tuple2<>(3.0, org.apache.spark.mllib.linalg.Vectors.dense(7.0, 8.0, 9.0)));
	JavaRDD<Tuple2<Double, org.apache.spark.mllib.linalg.Vector>> javaRddTuple = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleMllibVectorRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C1", new org.apache.spark.mllib.linalg.VectorUDT(), true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR_WITH_INDEX);

	Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
 
Example #7
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testDataFrameSumPYDMLVectorWithNoIDColumnNoFormatSpecified() {
	System.out.println("MLContextTest - DataFrame sum PYDML, vector with no ID column, no format specified");

	List<Vector> list = new ArrayList<>();
	list.add(Vectors.dense(1.0, 2.0, 3.0));
	list.add(Vectors.dense(4.0, 5.0, 6.0));
	list.add(Vectors.dense(7.0, 8.0, 9.0));
	JavaRDD<Vector> javaRddVector = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddVector.map(new VectorRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	Script script = dml("print('sum: ' + sum(M))").in("M", dataFrame);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
 
Example #8
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testDataFrameSumPYDMLVectorWithIDColumnNoFormatSpecified() {
	System.out.println("MLContextTest - DataFrame sum PYDML, vector with ID column, no format specified");

	List<Tuple2<Double, Vector>> list = new ArrayList<>();
	list.add(new Tuple2<>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
	list.add(new Tuple2<>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
	list.add(new Tuple2<>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
	JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	Script script = dml("print('sum: ' + sum(M))").in("M", dataFrame);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
 
Example #9
Source File: JavaAFTSurvivalRegressionExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaAFTSurvivalRegressionExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(1.218, 1.0, Vectors.dense(1.560, -0.605)),
    RowFactory.create(2.949, 0.0, Vectors.dense(0.346, 2.158)),
    RowFactory.create(3.627, 0.0, Vectors.dense(1.380, 0.231)),
    RowFactory.create(0.273, 1.0, Vectors.dense(0.520, 1.151)),
    RowFactory.create(4.199, 0.0, Vectors.dense(0.795, -0.226))
  );
  StructType schema = new StructType(new StructField[]{
    new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
    new StructField("censor", DataTypes.DoubleType, false, Metadata.empty()),
    new StructField("features", new VectorUDT(), false, Metadata.empty())
  });
  Dataset<Row> training = spark.createDataFrame(data, schema);
  double[] quantileProbabilities = new double[]{0.3, 0.6};
  AFTSurvivalRegression aft = new AFTSurvivalRegression()
    .setQuantileProbabilities(quantileProbabilities)
    .setQuantilesCol("quantiles");

  AFTSurvivalRegressionModel model = aft.fit(training);

  // Print the coefficients, intercept and scale parameter for AFT survival regression
  System.out.println("Coefficients: " + model.coefficients());
  System.out.println("Intercept: " + model.intercept());
  System.out.println("Scale: " + model.scale());
  model.transform(training).show(false);
  // $example off$

  spark.stop();
}
 
Example #10
Source File: RDDConverterUtils.java    From systemds with Apache License 2.0
private static Vector createVector(MatrixBlock row) {
	if( row.isEmptyBlock(false) ) //EMPTY SPARSE ROW
		return Vectors.sparse(row.getNumColumns(), new int[0], new double[0]);
	else if( row.isInSparseFormat() ) //SPARSE ROW
		return Vectors.sparse(row.getNumColumns(), 
				row.getSparseBlock().indexes(0), row.getSparseBlock().values(0));
	else // DENSE ROW
		return Vectors.dense(row.getDenseBlockValues());
}
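As the helper above shows, Vectors.sparse and Vectors.dense produce logically equivalent vectors; the sparse form is simply preferable when most entries are zero, as for an empty or mostly-empty matrix row. A small illustrative comparison (values made up):

// A dense vector and its sparse equivalent: size 4, single non-zero at index 1.
Vector dense = Vectors.dense(0.0, 3.0, 0.0, 0.0);
Vector sparse = Vectors.sparse(4, new int[]{1}, new double[]{3.0});

// Element-wise the two agree, even though the storage layout differs.
System.out.println(dense.apply(1) == sparse.apply(1));  // true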
 
Example #11
Source File: MinMaxScalerBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testMinMaxScaler() {
    //prepare data
    JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
            RowFactory.create(1.0, Vectors.dense(data[0])),
            RowFactory.create(2.0, Vectors.dense(data[1])),
            RowFactory.create(3.0, Vectors.dense(data[2])),
            RowFactory.create(4.0, Vectors.dense(data[3]))
    ));

    StructType schema = new StructType(new StructField[]{
            new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("features", new VectorUDT(), false, Metadata.empty())
    });

    Dataset<Row> df = spark.createDataFrame(jrdd, schema);

    //train model in spark
    MinMaxScalerModel sparkModel = new MinMaxScaler()
            .setInputCol("features")
            .setOutputCol("scaled")
            .setMin(-5)
            .setMax(5)
            .fit(df);


    //Export model, import it back and get transformer
    byte[] exportedModel = ModelExporter.export(sparkModel);
    final Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //compare predictions
    List<Row> sparkOutput = sparkModel.transform(df).orderBy("label").select("features", "scaled").collectAsList();
    assertCorrectness(sparkOutput, expected, transformer);
}
 
Example #12
Source File: SimplePredictionFromTextFile.java    From net.jgp.labs.spark with Apache License 2.0
private void start() {
  SparkSession spark = SparkSession.builder().appName(
      "Simple prediction from Text File").master("local").getOrCreate();

  spark.udf().register("vectorBuilder", new VectorBuilder(), new VectorUDT());

  String filename = "data/tuple-data-file.csv";
  StructType schema = new StructType(
      new StructField[] { new StructField("_c0", DataTypes.DoubleType, false,
          Metadata.empty()),
          new StructField("_c1", DataTypes.DoubleType, false, Metadata
              .empty()),
          new StructField("features", new VectorUDT(), true, Metadata
              .empty()), });

  Dataset<Row> df = spark.read().format("csv").schema(schema).option("header",
      "false")
      .load(filename);
  df = df.withColumn("valuefeatures", df.col("_c0")).drop("_c0");
  df = df.withColumn("label", df.col("_c1")).drop("_c1");
  df.printSchema();

  df = df.withColumn("features", callUDF("vectorBuilder", df.col(
      "valuefeatures")));
  df.printSchema();
  df.show();

  LinearRegression lr = new LinearRegression().setMaxIter(20);// .setRegParam(1).setElasticNetParam(1);

  // Fit the model to the data.
  LinearRegressionModel model = lr.fit(df);

  // Given a dataset, predict each point's label, and show the results.
  model.transform(df).show();

  LinearRegressionTrainingSummary trainingSummary = model.summary();
  System.out.println("numIterations: " + trainingSummary.totalIterations());
  System.out.println("objectiveHistory: " + Vectors.dense(trainingSummary
      .objectiveHistory()));
  trainingSummary.residuals().show();
  System.out.println("RMSE: " + trainingSummary.rootMeanSquaredError());
  System.out.println("r2: " + trainingSummary.r2());

  double intercept = model.intercept();
  System.out.println("Interesection: " + intercept);
  double regParam = model.getRegParam();
  System.out.println("Regression parameter: " + regParam);
  double tol = model.getTol();
  System.out.println("Tol: " + tol);
  Double feature = 7.0;
  Vector features = Vectors.dense(feature);
  double p = model.predict(features);

  System.out.println("Prediction for feature " + feature + " is " + p);
  System.out.println(8 * regParam + intercept);
}
 
Example #13
Source File: JavaElementwiseProductExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaElementwiseProductExample")
    .getOrCreate();

  // $example on$
  // Create some vector data; also works for sparse vectors
  List<Row> data = Arrays.asList(
    RowFactory.create("a", Vectors.dense(1.0, 2.0, 3.0)),
    RowFactory.create("b", Vectors.dense(4.0, 5.0, 6.0))
  );

  List<StructField> fields = new ArrayList<StructField>(2);
  fields.add(DataTypes.createStructField("id", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("vector", new VectorUDT(), false));

  StructType schema = DataTypes.createStructType(fields);

  Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

  Vector transformingVector = Vectors.dense(0.0, 1.0, 2.0);

  ElementwiseProduct transformer = new ElementwiseProduct()
    .setScalingVec(transformingVector)
    .setInputCol("vector")
    .setOutputCol("transformedVector");

  // Batch transform the vectors to create a new column:
  transformer.transform(dataFrame).show();
  // $example off$
  spark.stop();
}
 
Example #14
Source File: JavaPolynomialExpansionExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaPolynomialExpansionExample")
    .getOrCreate();

  // $example on$
  PolynomialExpansion polyExpansion = new PolynomialExpansion()
    .setInputCol("features")
    .setOutputCol("polyFeatures")
    .setDegree(3);

  List<Row> data = Arrays.asList(
    RowFactory.create(Vectors.dense(2.0, 1.0)),
    RowFactory.create(Vectors.dense(0.0, 0.0)),
    RowFactory.create(Vectors.dense(3.0, -1.0))
  );
  StructType schema = new StructType(new StructField[]{
    new StructField("features", new VectorUDT(), false, Metadata.empty()),
  });
  Dataset<Row> df = spark.createDataFrame(data, schema);

  Dataset<Row> polyDF = polyExpansion.transform(df);
  polyDF.show(false);
  // $example off$

  spark.stop();
}
 
Example #15
Source File: JavaPCAExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaPCAExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(Vectors.sparse(5, new int[]{1, 3}, new double[]{1.0, 7.0})),
    RowFactory.create(Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0)),
    RowFactory.create(Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))
  );

  StructType schema = new StructType(new StructField[]{
    new StructField("features", new VectorUDT(), false, Metadata.empty()),
  });

  Dataset<Row> df = spark.createDataFrame(data, schema);

  PCAModel pca = new PCA()
    .setInputCol("features")
    .setOutputCol("pcaFeatures")
    .setK(3)
    .fit(df);

  Dataset<Row> result = pca.transform(df).select("pcaFeatures");
  result.show(false);
  // $example off$
  spark.stop();
}
 
Example #16
Source File: JavaDCTExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaDCTExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(Vectors.dense(0.0, 1.0, -2.0, 3.0)),
    RowFactory.create(Vectors.dense(-1.0, 2.0, 4.0, -7.0)),
    RowFactory.create(Vectors.dense(14.0, -2.0, -5.0, 1.0))
  );
  StructType schema = new StructType(new StructField[]{
    new StructField("features", new VectorUDT(), false, Metadata.empty()),
  });
  Dataset<Row> df = spark.createDataFrame(data, schema);

  DCT dct = new DCT()
    .setInputCol("features")
    .setOutputCol("featuresDCT")
    .setInverse(false);

  Dataset<Row> dctDf = dct.transform(df);

  dctDf.select("featuresDCT").show(false);
  // $example off$

  spark.stop();
}
 
Example #17
Source File: ProteinSequenceEncoder.java    From mmtf-spark with Apache License 2.0
/**
 * One-hot encodes a protein sequence. The encoding covers the 20 natural
 * amino acids, plus X for any other residue, for a total of 21 elements
 * per residue.
 * 
 * @return dataset with feature vector appended
 */
public Dataset<Row> oneHotEncode() {
	SparkSession session = data.sparkSession();
	int maxLength = getMaxSequenceLength(data);

	session.udf().register("encoder", new UDF1<String, Vector>() {
		private static final long serialVersionUID = -6095318836772114908L;

		@Override
		public Vector call(String s) throws Exception {
			int len = AMINO_ACIDS21.size();
			double[] values = new double[len * maxLength];
			char[] seq = s.toCharArray();
			for (int i = 0; i < seq.length; i++) {
				int index = AMINO_ACIDS21.indexOf(seq[i]);
				// replace any non-matching code, e.g., U, with X
				if (index == -1) {
					index = AMINO_ACIDS21.indexOf('X');
				}
				values[i * len + index] = 1;
			}

			return Vectors.dense(values);
		}
	}, new VectorUDT());

	// append feature column
	data.createOrReplaceTempView("table");
	data = session.sql("SELECT *, encoder(" + inputCol + ") AS " + outputCol + " from table");
	
	return data;
}
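To make the layout concrete, here is a small standalone sketch of the same one-hot scheme outside Spark. The alphabet string below is only illustrative; the actual AMINO_ACIDS21 list in ProteinSequenceEncoder may use a different ordering.

// Illustrative only: residue i of the sequence occupies slots [i*21, (i+1)*21),
// with a single 1.0 at the position of that residue in the 21-letter alphabet.
String alphabet = "ACDEFGHIKLMNPQRSTVWYX";  // hypothetical ordering: 20 amino acids plus X
String seq = "AR";
int maxLength = 3;  // sequences are padded to a common maximum length
double[] values = new double[alphabet.length() * maxLength];
for (int i = 0; i < seq.length(); i++) {
	int index = alphabet.indexOf(seq.charAt(i));
	if (index == -1) {
		index = alphabet.indexOf('X');  // unknown residues map to X
	}
	values[i * alphabet.length() + index] = 1.0;
}
Vector oneHot = Vectors.dense(values);  // length 63, with two non-zero entries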
 
Example #18
Source File: ProteinSequenceEncoder.java    From mmtf-spark with Apache License 2.0
/**
 * Encodes a protein sequence by 7 physicochemical
 * properties. 
 * 
 * <p> See:  Meiler, J., Müller, M., Zeidler, A. et al. J Mol Model (2001) 7: 360. doi:
 * <a href="https://link.springer.com/article/10.1007/s008940100038">10.1007/s008940100038</a>
    *
 * @return dataset with feature vector appended
 */
public Dataset<Row> propertyEncode() {
	SparkSession session = data.sparkSession();
    int maxLength = getMaxSequenceLength(data);

	session.udf().register("encoder", new UDF1<String, Vector>(){
		private static final long serialVersionUID = 1L;

		@Override
		public Vector call(String s) throws Exception {
			double[] values = new double[7 * maxLength];
			for (int i = 0, k = 0; i < s.length(); i++) {
				double[] property = properties.get(s.charAt(i));
				if (property != null) {
					for (double p: property) {
						values[k++] = p;
					}
				}	
			}
			return Vectors.dense(values);
		}
	}, new VectorUDT());

	// append feature column
	data.createOrReplaceTempView("table");
	data = session.sql("SELECT *, encoder(" + inputCol + ") AS " + outputCol + " from table");

	return data;
}
 
Example #19
Source File: ProteinSequenceEncoder.java    From mmtf-spark with Apache License 2.0
/**
 * Encodes a protein sequence by a Blosum62 matrix.
 * 
 * <p> See: <a href="https://ftp.ncbi.nih.gov/repository/blocks/unix/blosum/BLOSUM/blosum62.blast.new">BLOSUM62 Matrix</a>
    *
 * @return dataset with feature vector appended
 */
public Dataset<Row> blosum62Encode() {
	SparkSession session = data.sparkSession();
    int maxLength = getMaxSequenceLength(data);

	session.udf().register("encoder", new UDF1<String, Vector>(){
		private static final long serialVersionUID = 1L;

		@Override
		public Vector call(String s) throws Exception {
			double[] values = new double[20*maxLength];
			for (int i = 0, k = 0; i < s.length(); i++) {
				double[] property = blosum62.get(s.charAt(i));
				if (property != null) {
					for (double p: property) {
						values[k++] = p;
					}
				}	
			}
			return Vectors.dense(values);
		}
	}, new VectorUDT());

	// append feature column
	data.createOrReplaceTempView("table");
	data = session.sql("SELECT *, encoder(" + inputCol + ") AS " + outputCol + " from table");

	return data;
}
 
Example #20
Source File: ProteinSequenceEncoder.java    From mmtf-spark with Apache License 2.0
private static Dataset<Row> averageFeatureVectors(Dataset<Row> data, String outputCol) {
	SparkSession session = data.sparkSession();

	session.udf().register("averager", new UDF3<Vector, Vector, Vector, Vector>() {
		private static final long serialVersionUID = -8190379199020903671L;

		@Override
		public Vector call(Vector v1, Vector v2, Vector v3) throws Exception {
			double[] f1 = v1.toArray();
			double[] f2 = v2.toArray();
			double[] f3 = v3.toArray();
			
			// arrays may be of different length
			int len = Math.min(Math.min(f1.length, f2.length), f3.length);
			double[] average = new double[len];

			for (int i = 0; i < len; i++) {
				average[i] = (f1[i] + f2[i] + f3[i]) / 3.0;
			}
			return Vectors.dense(average);
		}
	}, new VectorUDT());

	data.createOrReplaceTempView("table");
	// append new feature column with average values
	return session.sql("SELECT *, averager(features0,features1,features2) AS " + outputCol + " from table");
}
 
Example #21
Source File: JavaVectorAssemblerExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaVectorAssemblerExample")
    .getOrCreate();

  // $example on$
  StructType schema = createStructType(new StructField[]{
    createStructField("id", IntegerType, false),
    createStructField("hour", IntegerType, false),
    createStructField("mobile", DoubleType, false),
    createStructField("userFeatures", new VectorUDT(), false),
    createStructField("clicked", DoubleType, false)
  });
  Row row = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0);
  Dataset<Row> dataset = spark.createDataFrame(Arrays.asList(row), schema);

  VectorAssembler assembler = new VectorAssembler()
    .setInputCols(new String[]{"hour", "mobile", "userFeatures"})
    .setOutputCol("features");

  Dataset<Row> output = assembler.transform(dataset);
  System.out.println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column " +
      "'features'");
  output.select("features", "clicked").show(false);
  // $example off$

  spark.stop();
}
 
Example #22
Source File: JavaMinHashLSHExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaMinHashLSHExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(0, Vectors.sparse(6, new int[]{0, 1, 2}, new double[]{1.0, 1.0, 1.0})),
    RowFactory.create(1, Vectors.sparse(6, new int[]{2, 3, 4}, new double[]{1.0, 1.0, 1.0})),
    RowFactory.create(2, Vectors.sparse(6, new int[]{0, 2, 4}, new double[]{1.0, 1.0, 1.0}))
  );

  StructType schema = new StructType(new StructField[]{
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("keys", new VectorUDT(), false, Metadata.empty())
  });
  Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

  MinHashLSH mh = new MinHashLSH()
    .setNumHashTables(1)
    .setInputCol("keys")
    .setOutputCol("values");

  MinHashLSHModel model = mh.fit(dataFrame);
  model.transform(dataFrame).show();
  // $example off$

  spark.stop();
}
 
Example #23
Source File: JavaNormalizerExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaNormalizerExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
      RowFactory.create(0, Vectors.dense(1.0, 0.1, -8.0)),
      RowFactory.create(1, Vectors.dense(2.0, 1.0, -4.0)),
      RowFactory.create(2, Vectors.dense(4.0, 10.0, 8.0))
  );
  StructType schema = new StructType(new StructField[]{
      new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
      new StructField("features", new VectorUDT(), false, Metadata.empty())
  });
  Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

  // Normalize each Vector using $L^1$ norm.
  Normalizer normalizer = new Normalizer()
    .setInputCol("features")
    .setOutputCol("normFeatures")
    .setP(1.0);

  Dataset<Row> l1NormData = normalizer.transform(dataFrame);
  l1NormData.show();

  // Normalize each Vector using $L^\infty$ norm.
  Dataset<Row> lInfNormData =
    normalizer.transform(dataFrame, normalizer.p().w(Double.POSITIVE_INFINITY));
  lInfNormData.show();
  // $example off$

  spark.stop();
}