org.apache.spark.ml.feature.StandardScaler Java Examples

The following examples show how to use org.apache.spark.ml.feature.StandardScaler. Each listing names its source file and the open-source project it was taken from.
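Before the examples, a quick recap of what the transformer computes. StandardScaler is an Estimator: fit() scans a column of feature Vectors to collect per-column statistics and returns a StandardScalerModel, whose transform() rescales every feature. With setWithStd(true) each feature is divided by its standard deviation; with setWithMean(true) the column mean is subtracted first (centering builds dense output, so apply it to sparse input with care). The arithmetic is easy to verify by hand; below is a minimal plain-Java sketch with hypothetical values for a single feature column, using the unbiased (n - 1) sample standard deviation that Spark's scaler computes:

public class StandardScalerByHand {
    public static void main(String[] args) {
        double[] col = {1.0, 2.0, 3.0};  // hypothetical values for one feature column

        // Column mean.
        double mean = 0.0;
        for (double v : col) mean += v;
        mean /= col.length;

        // Unbiased sample standard deviation (divide by n - 1, as Spark does).
        double sumSq = 0.0;
        for (double v : col) sumSq += (v - mean) * (v - mean);
        double std = Math.sqrt(sumSq / (col.length - 1));

        // withMean=true, withStd=true: center, then scale.
        for (double v : col) {
            System.out.println((v - mean) / std);  // prints -1.0, 0.0, 1.0
        }
    }
}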
Example #1
Source File: JavaStandardScalerExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaStandardScalerExample")
    .getOrCreate();

  // $example on$
  Dataset<Row> dataFrame =
    spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");

  StandardScaler scaler = new StandardScaler()
    .setInputCol("features")
    .setOutputCol("scaledFeatures")
    .setWithStd(true)
    .setWithMean(false);

  // Compute summary statistics by fitting the StandardScaler
  StandardScalerModel scalerModel = scaler.fit(dataFrame);

  // Normalize each feature to have unit standard deviation.
  Dataset<Row> scaledData = scalerModel.transform(dataFrame);
  scaledData.show();
  // $example off$
  spark.stop();
}
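The fitted StandardScalerModel also keeps the column statistics it computed, which can be handy when debugging a pipeline. A minimal sketch, continuing from scalerModel above (assumes org.apache.spark.ml.linalg.Vector is imported):

  // Inspect the per-feature statistics learned from dataFrame.
  Vector mean = scalerModel.mean(); // column means (stored even though withMean=false)
  Vector std = scalerModel.std();   // column standard deviations
  System.out.println("mean: " + mean + "\nstd: " + std);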
 
Example #2
Source File: StandardScalerBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testStandardScaler() {


    JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
            RowFactory.create(1.0, Vectors.dense(data[0])),
            RowFactory.create(2.0, Vectors.dense(data[1])),
            RowFactory.create(3.0, Vectors.dense(data[2]))
    ));

    StructType schema = new StructType(new StructField[]{
            new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("features", new VectorUDT(), false, Metadata.empty())
    });

    Dataset<Row> df = spark.createDataFrame(jrdd, schema);

    // Train a model in Spark for each combination of withMean and withStd
    StandardScalerModel sparkModelNone = new StandardScaler()
            .setInputCol("features")
            .setOutputCol("scaledOutput")
            .setWithMean(false)
            .setWithStd(false)
            .fit(df);

    StandardScalerModel sparkModelWithMean = new StandardScaler()
            .setInputCol("features")
            .setOutputCol("scaledOutput")
            .setWithMean(true)
            .setWithStd(false)
            .fit(df);

    StandardScalerModel sparkModelWithStd = new StandardScaler()
            .setInputCol("features")
            .setOutputCol("scaledOutput")
            .setWithMean(false)
            .setWithStd(true)
            .fit(df);

    StandardScalerModel sparkModelWithBoth = new StandardScaler()
            .setInputCol("features")
            .setOutputCol("scaledOutput")
            .setWithMean(true)
            .setWithStd(true)
            .fit(df);


    // Export each model, import it back, and obtain the corresponding transformer
    byte[] exportedModel = ModelExporter.export(sparkModelNone);
    final Transformer transformerNone = ModelImporter.importAndGetTransformer(exportedModel);

    exportedModel = ModelExporter.export(sparkModelWithMean);
    final Transformer transformerWithMean = ModelImporter.importAndGetTransformer(exportedModel);

    exportedModel = ModelExporter.export(sparkModelWithStd);
    final Transformer transformerWithStd = ModelImporter.importAndGetTransformer(exportedModel);

    exportedModel = ModelExporter.export(sparkModelWithBoth);
    final Transformer transformerWithBoth = ModelImporter.importAndGetTransformer(exportedModel);


    // Compare Spark's output and the imported transformers' output against the expected values
    List<Row> sparkNoneOutput = sparkModelNone.transform(df).orderBy("label").select("features", "scaledOutput").collectAsList();
    assertCorrectness(sparkNoneOutput, data, transformerNone);

    List<Row> sparkWithMeanOutput = sparkModelWithMean.transform(df).orderBy("label").select("features", "scaledOutput").collectAsList();
    assertCorrectness(sparkWithMeanOutput, resWithMean, transformerWithMean);

    List<Row> sparkWithStdOutput = sparkModelWithStd.transform(df).orderBy("label").select("features", "scaledOutput").collectAsList();
    assertCorrectness(sparkWithStdOutput, resWithStd, transformerWithStd);

    List<Row> sparkWithBothOutput = sparkModelWithBoth.transform(df).orderBy("label").select("features", "scaledOutput").collectAsList();
    assertCorrectness(sparkWithBothOutput, resWithBoth, transformerWithBoth);

}
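The four models cover every combination of withMean and withStd. With both flags off the scaler is effectively an identity transform, which is why sparkNoneOutput is checked against the raw data fixture; the other three outputs are checked against precomputed expectations (resWithMean, resWithStd, resWithBoth). Those fixtures, like the assertCorrectness helper, are defined elsewhere in the test class and not shown in this excerpt.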
 
Example #3
Source File: StandardScalerBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testStandardScaler() {
    // Prepare training data
    List<LabeledPoint> localTraining = Arrays.asList(
            new LabeledPoint(1.0, Vectors.dense(data[0])),
            new LabeledPoint(2.0, Vectors.dense(data[1])),
            new LabeledPoint(3.0, Vectors.dense(data[2])));
    DataFrame df = sqlContext.createDataFrame(sc.parallelize(localTraining), LabeledPoint.class);


    // Train a model in Spark for each combination of withMean and withStd
    StandardScalerModel sparkModelNone = new StandardScaler()
            .setInputCol("features")
            .setOutputCol("scaledOutput")
            .setWithMean(false)
            .setWithStd(false)
            .fit(df);

    StandardScalerModel sparkModelWithMean = new StandardScaler()
            .setInputCol("features")
            .setOutputCol("scaledOutput")
            .setWithMean(true)
            .setWithStd(false)
            .fit(df);

    StandardScalerModel sparkModelWithStd = new StandardScaler()
            .setInputCol("features")
            .setOutputCol("scaledOutput")
            .setWithMean(false)
            .setWithStd(true)
            .fit(df);

    StandardScalerModel sparkModelWithBoth = new StandardScaler()
            .setInputCol("features")
            .setOutputCol("scaledOutput")
            .setWithMean(true)
            .setWithStd(true)
            .fit(df);


    // Export each model, import it back, and obtain the corresponding transformer
    byte[] exportedModel = ModelExporter.export(sparkModelNone, df);
    final Transformer transformerNone = ModelImporter.importAndGetTransformer(exportedModel);

    exportedModel = ModelExporter.export(sparkModelWithMean, df);
    final Transformer transformerWithMean = ModelImporter.importAndGetTransformer(exportedModel);

    exportedModel = ModelExporter.export(sparkModelWithStd, df);
    final Transformer transformerWithStd = ModelImporter.importAndGetTransformer(exportedModel);

    exportedModel = ModelExporter.export(sparkModelWithBoth, df);
    final Transformer transformerWithBoth = ModelImporter.importAndGetTransformer(exportedModel);


    // Compare Spark's output and the imported transformers' output against the expected values
    Row[] sparkNoneOutput = sparkModelNone.transform(df).orderBy("label").select("features", "scaledOutput").collect();
    assertCorrectness(sparkNoneOutput, data, transformerNone);

    Row[] sparkWithMeanOutput = sparkModelWithMean.transform(df).orderBy("label").select("features", "scaledOutput").collect();
    assertCorrectness(sparkWithMeanOutput, resWithMean, transformerWithMean);

    Row[] sparkWithStdOutput = sparkModelWithStd.transform(df).orderBy("label").select("features", "scaledOutput").collect();
    assertCorrectness(sparkWithStdOutput, resWithStd, transformerWithStd);

    Row[] sparkWithBothOutput = sparkModelWithBoth.transform(df).orderBy("label").select("features", "scaledOutput").collect();
    assertCorrectness(sparkWithBothOutput, resWithBoth, transformerWithBoth);

}
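This variant exercises the same round trip against the older Spark 1.x API: SQLContext and DataFrame in place of SparkSession and Dataset<Row>, a DataFrame built by reflection from LabeledPoint beans, collect() returning Row[] rather than collectAsList() returning List<Row>, and a two-argument ModelExporter.export(model, df) that takes the training DataFrame alongside the model.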