org.apache.spark.ml.tuning.CrossValidator Java Examples

The following examples show how to use org.apache.spark.ml.tuning.CrossValidator. Each example is taken from an open-source project; the source file, originating project, and license are noted above each listing.
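As a quick reference, all three examples below share the same basic pattern: wrap an Estimator, an Evaluator, and a ParamMap[] grid in a CrossValidator and call fit. A minimal sketch of that pattern follows; estimator, evaluator, paramGrid, and trainingData are placeholders to be supplied by your own code, not names from any one example.

// Minimal sketch of the common CrossValidator pattern (placeholder names).
CrossValidator cv = new CrossValidator()
        .setEstimator(estimator)            // e.g. a Pipeline or a single algorithm such as GBTRegressor
        .setEvaluator(evaluator)            // metric used to compare parameter settings
        .setEstimatorParamMaps(paramGrid)   // ParamMap[] built with ParamGridBuilder
        .setNumFolds(3);                    // k folds; defaults to 3 if not set
CrossValidatorModel cvModel = cv.fit(trainingData);  // tries every ParamMap, keeps the best model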
Example #1
Source File: SparkClassificationModel.java    From ambiverse-nlu with Apache License 2.0
/**
 * Runs k-fold cross-validation over the configured pipeline and parameter grid.
 *
 * @param trainData        the labeled training data
 * @param trainingSettings the training configuration (classification method, number of folds, etc.)
 * @return the fitted {@link CrossValidatorModel} holding the best model found
 */
public CrossValidatorModel crossValidate(DataFrame trainData, TrainingSettings trainingSettings) {

    //First create the pipeline and the ParamGrid
    createPipeline(trainData, trainingSettings);

    // We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
    // This will allow us to jointly choose parameters for all Pipeline stages.
    // A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
    CrossValidator cv = new CrossValidator()
            .setEstimator(pipeline)
            .setEvaluator(evaluator)
            .setEstimatorParamMaps(paramGrid)
            .setNumFolds(trainingSettings.getNumFolds());

    // For logistic regression, add a class-weight column to compensate for class imbalance:
    // positive examples are weighted by the fraction of negatives, and vice versa.
    if (classificationMethod.equals(TrainingSettings.ClassificationMethod.LOG_REG)) {
        long numPositive = trainData.filter(col("label").equalTo("1.0")).count();
        long datasetSize = trainData.count();
        double balancingRatio = (double) (datasetSize - numPositive) / datasetSize;

        trainData = trainData
                .withColumn("classWeightCol",
                        when(col("label").equalTo("1.0"), balancingRatio)
                                .otherwise(1.0 - balancingRatio));
    }
    // Run cross-validation, and choose the best set of parameters.
    bestModel = cv.fit(trainData);
    System.out.println("IS LARGER BETTER ?"+bestModel.getEvaluator().isLargerBetter());
    return bestModel;
}
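
A brief usage sketch, not part of the original source: the returned CrossValidatorModel exposes the average cross-validation metric for each ParamMap in the grid as well as the winning model, so a caller inside this class could inspect the tuning results roughly as below. Here testData is a hypothetical held-out DataFrame with the same schema as trainData.

// Sketch only: inspect the tuning results returned by crossValidate(...).
CrossValidatorModel cvModel = crossValidate(trainData, trainingSettings);
double[] avgMetrics = cvModel.avgMetrics();        // one averaged metric per ParamMap in the grid
for (int i = 0; i < avgMetrics.length; i++) {
    System.out.println("ParamMap #" + i + " -> average metric " + avgMetrics[i]);
}
// Score a hypothetical held-out DataFrame with the best model found.
DataFrame predictions = cvModel.bestModel().transform(testData);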
 
Example #2
Source File: BikeRentalPrediction.java    From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) {
	System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop");
	SparkSession sparkSession = SparkSession
			.builder()
			.master("local")
			.config("spark.sql.warehouse.dir",
					"file:///E:/sumitK/Hadoop/warehouse")
			.appName("BikeRentalPrediction").getOrCreate();
	Logger rootLogger = LogManager.getRootLogger();
	rootLogger.setLevel(Level.WARN);
	// We use the sparkSession.read() method to read the data and set a few options:
	//   'format': specifies the Spark CSV data source
	//   'header': set to true to indicate that the first line of the CSV data file is a header
	// The file is called 'hour.csv'.
	Dataset<Row> ds=sparkSession.read()
			  .format("org.apache.spark.sql.execution.datasources.csv.CSVFileFormat")
			  .option("header", "true")
			  .load("E:\\sumitK\\Hadoop\\Bike-Sharing-Dataset\\hour.csv");
	
	ds.cache();
	
	ds.select("season").show();;
	
	ds.show();
	
	System.out.println("Our dataset has rows :: "+ ds.count());
	
	Dataset<Row> df = ds.drop("instant").drop("dteday").drop("casual").drop("registered");
	df.printSchema();
	//col("...") is preferable to df.col("...")
	Dataset<Row> dformatted = df.select(col("season").cast(DataTypes.IntegerType),
			                            col("yr").cast(DataTypes.IntegerType),
										col("mnth").cast(DataTypes.IntegerType),
										col("hr").cast(DataTypes.IntegerType),
										col("holiday").cast(DataTypes.IntegerType),
										col("weekday").cast(DataTypes.IntegerType),
										col("workingday").cast(DataTypes.IntegerType),
										col("weathersit").cast(DataTypes.IntegerType),
										col("temp").cast(DataTypes.IntegerType),
										col("atemp").cast(DataTypes.IntegerType),
										col("hum").cast(DataTypes.IntegerType),
										col("windspeed").cast(DataTypes.IntegerType),
										col("cnt").cast(DataTypes.IntegerType));
	
	
	dformatted.printSchema();
	Dataset<Row>[] data = dformatted.randomSplit(new double[]{0.7, 0.3});
	System.out.println("We have training examples count :: " + data[0].count() + " and test examples count :: " + data[1].count());

	// Drop the label column 'cnt' and keep the remaining columns as features.
	String[] featuresCols = dformatted.drop("cnt").columns();

	for (String str : featuresCols) {
		System.out.println(str + " :: ");
	}

	// This concatenates all feature columns into a single feature vector in a new column "rawFeatures".
	VectorAssembler vectorAssembler = new VectorAssembler().setInputCols(featuresCols).setOutputCol("rawFeatures");
	// This identifies categorical features and indexes them.
	VectorIndexer vectorIndexer = new VectorIndexer().setInputCol("rawFeatures").setOutputCol("features").setMaxCategories(4);

	// Takes the "features" column and learns to predict "cnt".
	GBTRegressor gbt = new GBTRegressor().setLabelCol("cnt");
		
	// Define a grid of hyperparameters to test:
	//   - maxDepth: max depth of each decision tree in the GBT ensemble
	//   - maxIter: iterations, i.e., number of trees in each GBT ensemble
	// We keep these values small here. In practice, to get the highest accuracy, you would likely
	// want to try deeper trees (10 or higher) and more trees in the ensemble (>100).
	ParamMap[] paramGrid = new ParamGridBuilder()
			.addGrid(gbt.maxDepth(), new int[]{2, 5})
			.addGrid(gbt.maxIter(), new int[]{10, 100})
			.build();

	// Define an evaluation metric. This tells CrossValidator how well we are doing by comparing the true labels with predictions.
	RegressionEvaluator evaluator = new RegressionEvaluator()
			.setMetricName("rmse")
			.setLabelCol(gbt.getLabelCol())
			.setPredictionCol(gbt.getPredictionCol());

	// Declare the CrossValidator, which runs model tuning for us.
	CrossValidator cv = new CrossValidator()
			.setEstimator(gbt)
			.setEvaluator(evaluator)
			.setEstimatorParamMaps(paramGrid);
		
	Pipeline pipeline = new Pipeline().setStages(new PipelineStage[]{vectorAssembler, vectorIndexer, cv});

	PipelineModel pipelineModel = pipeline.fit(data[0]);

	Dataset<Row> predictions = pipelineModel.transform(data[1]);

	predictions.show();
	// To compare the label with the prediction (optionally alongside the feature columns):
	// predictions.select("cnt", "prediction").show();
}
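
A possible follow-up, not in the original source: evaluate the held-out split with the same RegressionEvaluator and pull the tuned GBT model out of the fitted pipeline. This sketch assumes the evaluator, predictions, and pipelineModel variables from the listing above, plus imports for org.apache.spark.ml.regression.GBTRegressionModel and org.apache.spark.ml.tuning.CrossValidatorModel.

// Sketch only: reuse the evaluator to report RMSE on the test split.
double rmse = evaluator.evaluate(predictions);
System.out.println("Test RMSE :: " + rmse);

// The CrossValidator was the third pipeline stage, so the fitted pipeline holds a
// CrossValidatorModel there; its bestModel is the tuned GBTRegressionModel.
CrossValidatorModel cvModel = (CrossValidatorModel) pipelineModel.stages()[2];
GBTRegressionModel bestGbt = (GBTRegressionModel) cvModel.bestModel();
System.out.println("Best GBT params :: " + bestGbt.extractParamMap());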
 
Example #3
Source File: JavaModelSelectionViaCrossValidationExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaModelSelectionViaCrossValidationExample")
    .getOrCreate();

  // $example on$
  // Prepare training documents, which are labeled.
  Dataset<Row> training = spark.createDataFrame(Arrays.asList(
    new JavaLabeledDocument(0L, "a b c d e spark", 1.0),
    new JavaLabeledDocument(1L, "b d", 0.0),
    new JavaLabeledDocument(2L,"spark f g h", 1.0),
    new JavaLabeledDocument(3L, "hadoop mapreduce", 0.0),
    new JavaLabeledDocument(4L, "b spark who", 1.0),
    new JavaLabeledDocument(5L, "g d a y", 0.0),
    new JavaLabeledDocument(6L, "spark fly", 1.0),
    new JavaLabeledDocument(7L, "was mapreduce", 0.0),
    new JavaLabeledDocument(8L, "e spark program", 1.0),
    new JavaLabeledDocument(9L, "a e c l", 0.0),
    new JavaLabeledDocument(10L, "spark compile", 1.0),
    new JavaLabeledDocument(11L, "hadoop software", 0.0)
  ), JavaLabeledDocument.class);

  // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
  Tokenizer tokenizer = new Tokenizer()
    .setInputCol("text")
    .setOutputCol("words");
  HashingTF hashingTF = new HashingTF()
    .setNumFeatures(1000)
    .setInputCol(tokenizer.getOutputCol())
    .setOutputCol("features");
  LogisticRegression lr = new LogisticRegression()
    .setMaxIter(10)
    .setRegParam(0.01);
  Pipeline pipeline = new Pipeline()
    .setStages(new PipelineStage[] {tokenizer, hashingTF, lr});

  // We use a ParamGridBuilder to construct a grid of parameters to search over.
  // With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,
  // this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.
  ParamMap[] paramGrid = new ParamGridBuilder()
    .addGrid(hashingTF.numFeatures(), new int[] {10, 100, 1000})
    .addGrid(lr.regParam(), new double[] {0.1, 0.01})
    .build();

  // We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
  // This will allow us to jointly choose parameters for all Pipeline stages.
  // A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
  // Note that the evaluator here is a BinaryClassificationEvaluator and its default metric
  // is areaUnderROC.
  CrossValidator cv = new CrossValidator()
    .setEstimator(pipeline)
    .setEvaluator(new BinaryClassificationEvaluator())
    .setEstimatorParamMaps(paramGrid).setNumFolds(2);  // Use 3+ in practice

  // Run cross-validation, and choose the best set of parameters.
  CrossValidatorModel cvModel = cv.fit(training);

  // Prepare test documents, which are unlabeled.
  Dataset<Row> test = spark.createDataFrame(Arrays.asList(
    new JavaDocument(4L, "spark i j k"),
    new JavaDocument(5L, "l m n"),
    new JavaDocument(6L, "mapreduce spark"),
    new JavaDocument(7L, "apache hadoop")
  ), JavaDocument.class);

  // Make predictions on test documents. cvModel uses the best model found (lrModel).
  Dataset<Row> predictions = cvModel.transform(test);
  for (Row r : predictions.select("id", "text", "probability", "prediction").collectAsList()) {
    System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2)
      + ", prediction=" + r.get(3));
  }
  // $example off$

  spark.stop();
}
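
As a follow-up (not part of the original example), the fitted CrossValidatorModel can be inspected and, in Spark 2.x, persisted with the ML persistence API. A sketch that would sit just before spark.stop(); the save path is a placeholder.

// Sketch only: average areaUnderROC for each of the 6 ParamMaps tried above.
double[] metrics = cvModel.avgMetrics();
ParamMap[] grid = cvModel.getEstimatorParamMaps();
for (int i = 0; i < metrics.length; i++) {
  System.out.println(grid[i] + " -> avg areaUnderROC = " + metrics[i]);
}

// Persist the tuned model and reload it later (path is a placeholder).
cvModel.write().overwrite().save("/tmp/spark-cv-model");
CrossValidatorModel sameModel = CrossValidatorModel.load("/tmp/spark-cv-model");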