org.apache.spark.ml.regression.LinearRegression Java Examples

The following examples show how to use org.apache.spark.ml.regression.LinearRegression. Each example is taken from an open-source project; the source file and license are noted above each listing.
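At its core, LinearRegression is an Estimator: it consumes a Dataset<Row> with a numeric label column and a vector-valued features column, and produces a LinearRegressionModel. The minimal, self-contained sketch below illustrates that contract before the project examples; the tiny in-memory dataset and its values are illustrative assumptions, not taken from any of the projects.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.ml.linalg.VectorUDT;
import org.apache.spark.ml.linalg.Vectors;
import org.apache.spark.ml.regression.LinearRegression;
import org.apache.spark.ml.regression.LinearRegressionModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class MinimalLinearRegression {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .master("local[*]")
        .appName("MinimalLinearRegression")
        .getOrCreate();

    // A toy dataset where label = 2 * x, with a single feature.
    List<Row> rows = Arrays.asList(
        RowFactory.create(2.0, Vectors.dense(1.0)),
        RowFactory.create(4.0, Vectors.dense(2.0)),
        RowFactory.create(6.0, Vectors.dense(3.0)));
    StructType schema = new StructType()
        .add("label", DataTypes.DoubleType, false)
        .add("features", new VectorUDT(), false);
    Dataset<Row> data = spark.createDataFrame(rows, schema);

    // Fit the model and inspect the learned line.
    LinearRegressionModel model = new LinearRegression().fit(data);
    System.out.println("Coefficients: " + model.coefficients()
        + " Intercept: " + model.intercept());

    spark.stop();
  }
}

The project examples below use the same Estimator in streaming, batch, pipeline, and model-selection settings.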
Example #1
Source File: SparkMLHouses.java    From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License
public static void main(String[] args) throws InterruptedException, StreamingQueryException {

                System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

                // * the schema can be written to disk and read back from disk
                // * the schema need not be complete; it can contain only the fields you need
                StructType HOUSES_SCHEMA = 
                       new StructType()
                           .add("House", LongType, true)
                           .add("Taxes", LongType, true)
                           .add("Bedrooms", LongType, true)
                           .add("Baths", FloatType, true)
                           .add("Quadrant", LongType, true)
                           .add("NW", StringType, true)
                           .add("Price($)", LongType, false)
                           .add("Size(sqft)", LongType, false)
                           .add("lot", LongType, true);

                final SparkConf conf = new SparkConf()
                    .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
                    .setAppName(APPLICATION_NAME)
                    .set("spark.sql.caseSensitive", CASE_SENSITIVE);

                SparkSession sparkSession = SparkSession.builder()
                    .config(conf)
                    .getOrCreate();

                Dataset<Row> housesDF = sparkSession.read()
                     .schema(HOUSES_SCHEMA)
                     .json(HOUSES_FILE_PATH);
             
                // Gathering Data
                Dataset<Row> gatheredDF = housesDF.select(col("Taxes"), 
                    col("Bedrooms"), col("Baths"),
                    col("Size(sqft)"), col("Price($)"));
                
                // Data Preparation  
                Dataset<Row> labelDF = gatheredDF.withColumnRenamed("Price($)", "label");
                
                Imputer imputer = new Imputer()
                    // .setMissingValue(1.0d)
                    .setInputCols(new String[] { "Baths" })
                    .setOutputCols(new String[] { "~Baths~" });

                VectorAssembler assembler = new VectorAssembler()
                    .setInputCols(new String[] { "Taxes", "Bedrooms", "~Baths~", "Size(sqft)" })
                    .setOutputCol("features");
                
                // Choosing a Model               
                LinearRegression linearRegression = new LinearRegression();
                linearRegression.setMaxIter(1000);

                Pipeline pipeline = new Pipeline()
                                .setStages(new PipelineStage[] {
                                    imputer, assembler, linearRegression 
                                });

                // Splitting the Data and Training the Model
                Dataset<Row>[] splitDF = labelDF.randomSplit(new double[] { 0.8, 0.2 });

                Dataset<Row> trainDF = splitDF[0];
                Dataset<Row> evaluationDF = splitDF[1];

                PipelineModel pipelineModel = pipeline.fit(trainDF);
                
                // Evaluation 
                Dataset<Row> predictionsDF = pipelineModel.transform(evaluationDF);

                predictionsDF.show(false);

                Dataset<Row> forEvaluationDF = predictionsDF.select(col("label"), 
                    col("prediction"));

                RegressionEvaluator evaluateR2 = new RegressionEvaluator().setMetricName("r2");
                RegressionEvaluator evaluateRMSE = new RegressionEvaluator().setMetricName("rmse");

                double r2 = evaluateR2.evaluate(forEvaluationDF);
                double rmse = evaluateRMSE.evaluate(forEvaluationDF);

                logger.info("---------------------------");
                logger.info("R2 = " + r2);
                logger.info("RMSE = " + rmse);
                logger.info("---------------------------");
        }
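The comment at the top of this example about persisting the schema refers to the JSON round-trip that StructType supports: json() serializes it, and DataType.fromJson() restores it. A minimal, self-contained sketch of that idea; the file name and the two-field schema are illustrative assumptions:

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class SchemaRoundTrip {
    public static void main(String[] args) throws IOException {
        StructType schema = new StructType()
            .add("Taxes", DataTypes.LongType, true)
            .add("Baths", DataTypes.FloatType, true);

        // Write the schema to disk as JSON ...
        Path file = Paths.get("houses-schema.json");
        Files.write(file, schema.json().getBytes(StandardCharsets.UTF_8));

        // ... and read it back; fromJson() returns a DataType, so cast it.
        String json = new String(Files.readAllBytes(file), StandardCharsets.UTF_8);
        StructType restored = (StructType) DataType.fromJson(json);
        System.out.println(restored.prettyJson());
    }
}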
 
Example #2
Source File: JavaLinearRegressionWithElasticNetExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaLinearRegressionWithElasticNetExample")
    .getOrCreate();

  // $example on$
  // Load training data.
  Dataset<Row> training = spark.read().format("libsvm")
    .load("data/mllib/sample_linear_regression_data.txt");

  LinearRegression lr = new LinearRegression()
    .setMaxIter(10)
    .setRegParam(0.3)
    .setElasticNetParam(0.8);

  // Fit the model.
  LinearRegressionModel lrModel = lr.fit(training);

  // Print the coefficients and intercept for linear regression.
  System.out.println("Coefficients: "
    + lrModel.coefficients() + " Intercept: " + lrModel.intercept());

  // Summarize the model over the training set and print out some metrics.
  LinearRegressionTrainingSummary trainingSummary = lrModel.summary();
  System.out.println("numIterations: " + trainingSummary.totalIterations());
  System.out.println("objectiveHistory: " + Vectors.dense(trainingSummary.objectiveHistory()));
  trainingSummary.residuals().show();
  System.out.println("RMSE: " + trainingSummary.rootMeanSquaredError());
  System.out.println("r2: " + trainingSummary.r2());
  // $example off$

  spark.stop();
}
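A fitted LinearRegressionModel also supports the standard ML persistence API, so it can be saved and reloaded, for example by a separate scoring job. A short continuation of the example above, placed before spark.stop(); the save path is an illustrative assumption:

  // Save the fitted model and load it back later.
  lrModel.write().overwrite().save("/tmp/lr-model");
  LinearRegressionModel reloaded = LinearRegressionModel.load("/tmp/lr-model");
  System.out.println("Reloaded intercept: " + reloaded.intercept());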
 
Example #3
Source File: SimplePredictionFromTextFile.java    From net.jgp.labs.spark with Apache License 2.0
private void start() {
  SparkSession spark = SparkSession.builder().appName(
      "Simple prediction from Text File").master("local").getOrCreate();

  spark.udf().register("vectorBuilder", new VectorBuilder(), new VectorUDT());

  String filename = "data/tuple-data-file.csv";
  StructType schema = new StructType(
      new StructField[] { new StructField("_c0", DataTypes.DoubleType, false,
          Metadata.empty()),
          new StructField("_c1", DataTypes.DoubleType, false, Metadata
              .empty()),
          new StructField("features", new VectorUDT(), true, Metadata
              .empty()), });

  Dataset<Row> df = spark.read().format("csv").schema(schema).option("header",
      "false")
      .load(filename);
  df = df.withColumn("valuefeatures", df.col("_c0")).drop("_c0");
  df = df.withColumn("label", df.col("_c1")).drop("_c1");
  df.printSchema();

  df = df.withColumn("features", callUDF("vectorBuilder", df.col(
      "valuefeatures")));
  df.printSchema();
  df.show();

  LinearRegression lr = new LinearRegression().setMaxIter(20); // optionally: .setRegParam(1).setElasticNetParam(1)

  // Fit the model to the data.
  LinearRegressionModel model = lr.fit(df);

  // Given a dataset, predict each point's label, and show the results.
  model.transform(df).show();

  LinearRegressionTrainingSummary trainingSummary = model.summary();
  System.out.println("numIterations: " + trainingSummary.totalIterations());
  System.out.println("objectiveHistory: " + Vectors.dense(trainingSummary
      .objectiveHistory()));
  trainingSummary.residuals().show();
  System.out.println("RMSE: " + trainingSummary.rootMeanSquaredError());
  System.out.println("r2: " + trainingSummary.r2());

  double intercept = model.intercept();
  System.out.println("Interesection: " + intercept);
  double regParam = model.getRegParam();
  System.out.println("Regression parameter: " + regParam);
  double tol = model.getTol();
  System.out.println("Tol: " + tol);
  Double feature = 7.0;
  Vector features = Vectors.dense(feature);
  double p = model.predict(features);

  System.out.println("Prediction for feature " + feature + " is " + p);
  // Manually apply the fitted line, y = slope * x + intercept, for x = 8.
  double slope = model.coefficients().apply(0);
  System.out.println(8 * slope + intercept);
}
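The VectorBuilder class registered as a UDF above is not part of this listing. A minimal implementation consistent with how it is called (one double in, a dense feature vector out) could look like the sketch below; this is a reconstruction, not the project's actual source:

import org.apache.spark.ml.linalg.Vector;
import org.apache.spark.ml.linalg.Vectors;
import org.apache.spark.sql.api.java.UDF1;

// Wraps a single numeric column into the one-element Vector that
// LinearRegression expects in its features column.
public class VectorBuilder implements UDF1<Double, Vector> {
  private static final long serialVersionUID = 1L;

  @Override
  public Vector call(Double value) {
    return Vectors.dense(value);
  }
}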
 
Example #4
Source File: DatasetRegressor.java    From mmtf-spark with Apache License 2.0
/**
 * @param args args[0]: path to the parquet file, args[1]: name of the prediction column
 * @throws IOException if the parquet file cannot be read
 */
public static void main(String[] args) throws IOException {

	if (args.length != 2) {
		System.err.println("Usage: " + DatasetRegressor.class.getSimpleName() + " <parquet file> <prediction column name>");
		System.exit(1);
	}

	// name of the prediction column
	String label = args[1];
	
	long start = System.nanoTime();

	SparkSession spark = SparkSession
			.builder()
			.master("local[*]")
			.appName(DatasetRegressor.class.getSimpleName())
			.getOrCreate();

	Dataset<Row> data = spark.read().parquet(args[0]).cache();
	
	int featureCount = ((DenseVector)data.first().getAs("features")).numActives();
	System.out.println("Feature count: "  + featureCount);

	System.out.println("Dataset size : " + data.count());

	double testFraction = 0.3;
	long seed = 123;

	LinearRegression lr = new LinearRegression()
			.setLabelCol(label)
			.setFeaturesCol("features");
	
	SparkRegressor reg = new SparkRegressor(lr, label, testFraction, seed);
	System.out.println(reg.fit(data));
	
	GBTRegressor gbt = new GBTRegressor()
			.setLabelCol(label)
			.setFeaturesCol("features");
	
	reg = new SparkRegressor(gbt, label, testFraction, seed);
	System.out.println(reg.fit(data));
	
	GeneralizedLinearRegression glr = new GeneralizedLinearRegression()
			.setLabelCol(label)
			.setFeaturesCol("features")
			.setFamily("gaussian")
			.setLink("identity")
			.setMaxIter(10)
			.setRegParam(0.3);
	
	reg = new SparkRegressor(glr, label, testFraction, seed);
	System.out.println(reg.fit(data));
	
	
	long end = System.nanoTime();

	System.out.println((end-start)/1E9 + " sec");
}
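SparkRegressor is an mmtf-spark helper whose source is not shown here. Judging from how it is used (a regressor, a label column, a test fraction, and a seed go in; a printable summary comes out), its core logic is roughly the following sketch. The class shape, field names, and String return type are assumptions:

import org.apache.spark.ml.Predictor;
import org.apache.spark.ml.evaluation.RegressionEvaluator;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

// Hypothetical reconstruction: split the data, fit the supplied regressor,
// and report RMSE and r2 on the held-out fraction.
public class SparkRegressor {
	private final Predictor<?, ?, ?> predictor;
	private final String label;
	private final double testFraction;
	private final long seed;

	public SparkRegressor(Predictor<?, ?, ?> predictor, String label,
			double testFraction, long seed) {
		this.predictor = predictor;
		this.label = label;
		this.testFraction = testFraction;
		this.seed = seed;
	}

	public String fit(Dataset<Row> data) {
		Dataset<Row>[] splits =
				data.randomSplit(new double[] {1.0 - testFraction, testFraction}, seed);
		Dataset<Row> predictions = predictor.fit(splits[0]).transform(splits[1]);

		RegressionEvaluator evaluator = new RegressionEvaluator()
				.setLabelCol(label)
				.setPredictionCol("prediction");
		double rmse = evaluator.setMetricName("rmse").evaluate(predictions);
		double r2 = evaluator.setMetricName("r2").evaluate(predictions);
		return "rmse = " + rmse + ", r2 = " + r2;
	}
}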
 
Example #5
Source File: JavaModelSelectionViaTrainValidationSplitExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaModelSelectionViaTrainValidationSplitExample")
    .getOrCreate();

  // $example on$
  Dataset<Row> data = spark.read().format("libsvm")
    .load("data/mllib/sample_linear_regression_data.txt");

  // Prepare training and test data.
  Dataset<Row>[] splits = data.randomSplit(new double[] {0.9, 0.1}, 12345);
  Dataset<Row> training = splits[0];
  Dataset<Row> test = splits[1];

  LinearRegression lr = new LinearRegression();

  // We use a ParamGridBuilder to construct a grid of parameters to search over.
  // TrainValidationSplit will try all combinations of values and determine best model using
  // the evaluator.
  ParamMap[] paramGrid = new ParamGridBuilder()
    .addGrid(lr.regParam(), new double[] {0.1, 0.01})
    .addGrid(lr.fitIntercept())
    .addGrid(lr.elasticNetParam(), new double[] {0.0, 0.5, 1.0})
    .build();

  // In this case the estimator is simply the linear regression.
  // A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
  TrainValidationSplit trainValidationSplit = new TrainValidationSplit()
    .setEstimator(lr)
    .setEvaluator(new RegressionEvaluator())
    .setEstimatorParamMaps(paramGrid)
    .setTrainRatio(0.8);  // 80% for training and the remaining 20% for validation

  // Run train validation split, and choose the best set of parameters.
  TrainValidationSplitModel model = trainValidationSplit.fit(training);

  // Make predictions on test data. model is the model with combination of parameters
  // that performed best.
  model.transform(test)
    .select("features", "label", "prediction")
    .show();
  // $example off$

  spark.stop();
}
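After fitting, TrainValidationSplitModel records which parameter combination won and the metric each combination achieved on the validation set. A short follow-on sketch, placed before spark.stop() (it additionally assumes org.apache.spark.ml.regression.LinearRegressionModel is imported):

  // Inspect the winning parameter combination and the validation metrics.
  LinearRegressionModel best = (LinearRegressionModel) model.bestModel();
  System.out.println("Best regParam: " + best.getRegParam()
      + ", elasticNetParam: " + best.getElasticNetParam()
      + ", fitIntercept: " + best.getFitIntercept());
  System.out.println("Validation metrics: "
      + java.util.Arrays.toString(model.validationMetrics()));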