org.apache.spark.ml.classification.LogisticRegression Java Examples

Source File: JavaMulticlassLogisticRegressionWithElasticNetExample.java From SparkDemo with MIT License

6 votes

public static void main(String[] args) {
    final String appName = "JavaMulticlassLogisticRegressionWithElasticNetExample";
    SparkSession spark = SparkSession.builder().appName(appName).getOrCreate();

    // $example on$
    // Read the multiclass sample data set, which is stored in libsvm format.
    final String dataPath = "data/mllib/sample_multiclass_classification_data.txt";
    Dataset<Row> training = spark.read().format("libsvm").load(dataPath);

    // Configure the estimator: 10 iterations at most, regParam 0.3, elasticNetParam 0.8.
    LogisticRegression lr = new LogisticRegression();
    lr.setMaxIter(10);
    lr.setRegParam(0.3);
    lr.setElasticNetParam(0.8);

    // Train on the loaded data.
    LogisticRegressionModel lrModel = lr.fit(training);

    // Report the fitted coefficient matrix and intercept vector.
    String report = "Coefficients: \n" + lrModel.coefficientMatrix()
            + " \nIntercept: " + lrModel.interceptVector();
    System.out.println(report);
    // $example off$

    spark.stop();
}

Source File: LogisticRegression1ExporterTest.java From spark-transformers with Apache License 2.0

5 votes

@Test
public void shouldExportAndImportCorrectly() {
    // Fit a default logistic regression on the binary libsvm fixture.
    String fixturePath = "src/test/resources/binary_classification_test.libsvm";
    DataFrame trainingData = sqlContext.read().format("libsvm").load(fixturePath);
    LogisticRegressionModel sparkModel = new LogisticRegression().fit(trainingData);

    // Round-trip the model: export to bytes, then import the model info back.
    byte[] serialized = ModelExporter.export(sparkModel, trainingData);
    LogisticRegressionModelInfo restored =
            (LogisticRegressionModelInfo) ModelImporter.importModelInfo(serialized);

    // Scalar fields must agree within EPSILON.
    assertEquals(sparkModel.intercept(), restored.getIntercept(), EPSILON);
    assertEquals(sparkModel.numClasses(), restored.getNumClasses(), EPSILON);
    assertEquals(sparkModel.numFeatures(), restored.getNumFeatures(), EPSILON);
    assertEquals(sparkModel.getThreshold(), restored.getThreshold(), EPSILON);

    // Weights are compared position by position, so element order matters.
    int index = 0;
    while (index < restored.getNumFeatures()) {
        assertEquals(sparkModel.weights().toArray()[index], restored.getWeights()[index], EPSILON);
        index++;
    }

    // The first input/output key must be the features/prediction column name.
    assertEquals(sparkModel.getFeaturesCol(), restored.getInputKeys().iterator().next());
    assertEquals(sparkModel.getPredictionCol(), restored.getOutputKeys().iterator().next());
}

Source File: LogisticRegression1BridgeTest.java From spark-transformers with Apache License 2.0

5 votes

@Test
public void testLogisticRegression() {
    // Fit a default logistic regression on the binary libsvm fixture.
    String fixturePath = "src/test/resources/binary_classification_test.libsvm";
    DataFrame trainingData = sqlContext.read().format("libsvm").load(fixturePath);
    LogisticRegressionModel sparkModel = new LogisticRegression().fit(trainingData);

    // Export the fitted model and rebuild it as a standalone Transformer.
    byte[] serialized = ModelExporter.export(sparkModel, trainingData);
    Transformer transformer = ModelImporter.importAndGetTransformer(serialized);

    // Every point of the fixture must get the same prediction from both models.
    List<LabeledPoint> testPoints =
            MLUtils.loadLibSVMFile(sc.sc(), fixturePath).toJavaRDD().collect();
    for (LabeledPoint point : testPoints) {
        Vector features = point.features();
        double expected = sparkModel.predict(features);

        // The transformer reads "features" from the map and writes "prediction" into it.
        Map<String, Object> record = new HashMap<String, Object>();
        record.put("features", features.toArray());
        transformer.transform(record);
        double observed = (double) record.get("prediction");

        assertEquals(expected, observed, EPSILON);
    }
}

Source File: JavaLogisticRegressionWithElasticNetExample.java From SparkDemo with MIT License

5 votes

/**
 * Fits logistic regression twice on the sample libsvm data set -- once without setting a
 * family and once with the "multinomial" family -- and prints the parameters of each model.
 *
 * @param args unused
 */
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaLogisticRegressionWithElasticNetExample")
    .getOrCreate();

  // $example on$
  // Load training data
  Dataset<Row> training = spark.read().format("libsvm")
    .load("data/mllib/sample_libsvm_data.txt");

  LogisticRegression lr = new LogisticRegression()
    .setMaxIter(10)
    .setRegParam(0.3)
    .setElasticNetParam(0.8);

  // Fit the model
  LogisticRegressionModel lrModel = lr.fit(training);

  // Print the coefficients and intercept for logistic regression
  System.out.println("Coefficients: "
    + lrModel.coefficients() + " Intercept: " + lrModel.intercept());

  // We can also use the multinomial family for binary classification
  LogisticRegression mlr = new LogisticRegression()
          .setMaxIter(10)
          .setRegParam(0.3)
          .setElasticNetParam(0.8)
          .setFamily("multinomial");

  // Fit the model
  LogisticRegressionModel mlrModel = mlr.fit(training);

  // Print the coefficients and intercepts for logistic regression with multinomial family.
  // Both values come from mlrModel, the model fitted with the multinomial family.
  System.out.println("Multinomial coefficients: " + mlrModel.coefficientMatrix()
    + "\nMultinomial intercepts: " + mlrModel.interceptVector());
  // $example off$

  spark.stop();
}

Source File: LogisticRegression1ExporterTest.java From spark-transformers with Apache License 2.0

5 votes

@Test
public void shouldExportAndImportCorrectly() {
    // Fit a default logistic regression on the binary libsvm fixture.
    String fixturePath = "src/test/resources/binary_classification_test.libsvm";
    Dataset<Row> trainingData = spark.read().format("libsvm").load(fixturePath);
    LogisticRegressionModel sparkModel = new LogisticRegression().fit(trainingData);

    // Round-trip the model: export to bytes, then import the model info back.
    byte[] serialized = ModelExporter.export(sparkModel);
    LogisticRegressionModelInfo restored =
            (LogisticRegressionModelInfo) ModelImporter.importModelInfo(serialized);

    // Scalar fields must agree within a tolerance of 0.01.
    assertEquals(sparkModel.intercept(), restored.getIntercept(), 0.01);
    assertEquals(sparkModel.numClasses(), restored.getNumClasses(), 0.01);
    assertEquals(sparkModel.numFeatures(), restored.getNumFeatures(), 0.01);
    assertEquals(sparkModel.getThreshold(), restored.getThreshold(), 0.01);

    // Coefficients are compared position by position, so element order matters.
    int index = 0;
    while (index < restored.getNumFeatures()) {
        assertEquals(sparkModel.coefficients().toArray()[index], restored.getWeights()[index], 0.01);
        index++;
    }

    // The first input/output key must be the features/prediction column name.
    assertEquals(sparkModel.getFeaturesCol(), restored.getInputKeys().iterator().next());
    assertEquals(sparkModel.getPredictionCol(), restored.getOutputKeys().iterator().next());
}

Source File: LogisticRegression1BridgeTest.java From spark-transformers with Apache License 2.0

5 votes

@Test
public void testLogisticRegression() {
    // Fit a default logistic regression on the binary libsvm fixture.
    String fixturePath = "src/test/resources/binary_classification_test.libsvm";
    Dataset<Row> trainingData = spark.read().format("libsvm").load(fixturePath);
    LogisticRegressionModel sparkModel = new LogisticRegression().fit(trainingData);

    // Export the fitted model and rebuild it as a standalone Transformer.
    byte[] serialized = ModelExporter.export(sparkModel);
    Transformer transformer = ModelImporter.importAndGetTransformer(serialized);

    // Every point of the fixture must get the same prediction from both models.
    List<LabeledPoint> testPoints =
            MLUtils.loadLibSVMFile(jsc.sc(), fixturePath).toJavaRDD().collect();
    for (LabeledPoint point : testPoints) {
        // The points are loaded through MLUtils; asML() converts the features for the ml model.
        Vector features = point.features().asML();
        double expected = sparkModel.predict(features);

        // The transformer reads "features" from the map and writes "prediction" into it.
        Map<String, Object> record = new HashMap<String, Object>();
        record.put("features", features.toArray());
        transformer.transform(record);
        double observed = (double) record.get("prediction");

        assertEquals(expected, observed, 0.01);
    }
}

Source File: TestSparkMLDeriver.java From envelope with Apache License 2.0

5 votes

/**
 * Fits a tokenizer -> hashingTF -> logistic regression pipeline on four hard-coded
 * labeled documents and saves the fitted pipeline model, overwriting any existing one.
 * Sourced from the Spark ML documentation and examples.
 *
 * @param savePath location the fitted pipeline model is written to
 * @throws IOException declared because saving the model can fail with it
 */
private void generateAndSaveModel(String savePath) throws IOException {
  // Schema of the training rows: (id, text, label), none of them nullable.
  StructType trainingSchema = DataTypes.createStructType(
      Lists.newArrayList(
          DataTypes.createStructField("id", DataTypes.LongType, false),
          DataTypes.createStructField("text", DataTypes.StringType, false),
          DataTypes.createStructField("label", DataTypes.DoubleType, false)));

  Dataset<Row> training = Contexts.getSparkSession().createDataFrame(
      Lists.newArrayList(
          RowFactory.create(0L, "a b c d e spark", 1.0),
          RowFactory.create(1L, "b d", 0.0),
          RowFactory.create(2L, "spark f g h", 1.0),
          RowFactory.create(3L, "hadoop mapreduce", 0.0)),
      trainingSchema);

  // Stage 1: split "text" into "words".
  Tokenizer tokenizer = new Tokenizer();
  tokenizer.setInputCol("text");
  tokenizer.setOutputCol("words");

  // Stage 2: hash the tokenizer's output column into a 1000-dimensional "features" column.
  HashingTF hashingTF = new HashingTF();
  hashingTF.setNumFeatures(1000);
  hashingTF.setInputCol(tokenizer.getOutputCol());
  hashingTF.setOutputCol("features");

  // Stage 3: the classifier.
  LogisticRegression lr = new LogisticRegression();
  lr.setMaxIter(10);
  lr.setRegParam(0.001);

  PipelineStage[] stages = {tokenizer, hashingTF, lr};
  PipelineModel model = new Pipeline().setStages(stages).fit(training);

  model.write().overwrite().save(savePath);
}

Source File: JavaLogisticRegressionSummaryExample.java From SparkDemo with MIT License

4 votes

public static void main(String[] args) {
  SparkSession spark =
    SparkSession.builder().appName("JavaLogisticRegressionSummaryExample").getOrCreate();

  // Read the sample data set, which is stored in libsvm format.
  Dataset<Row> training =
    spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");

  // Configure the estimator and train it.
  LogisticRegression lr = new LogisticRegression();
  lr.setMaxIter(10);
  lr.setRegParam(0.3);
  lr.setElasticNetParam(0.8);
  LogisticRegressionModel lrModel = lr.fit(training);

  // $example on$
  // Pull the training summary out of the fitted model.
  LogisticRegressionTrainingSummary trainingSummary = lrModel.summary();

  // Print the objective value recorded for each iteration, one per line.
  double[] objectiveHistory = trainingSummary.objectiveHistory();
  for (int iteration = 0; iteration < objectiveHistory.length; iteration++) {
    System.out.println(objectiveHistory[iteration]);
  }

  // The problem is binary, so the summary is cast to BinaryLogisticRegressionSummary to
  // reach the binary-only metrics.
  BinaryLogisticRegressionSummary binarySummary =
    (BinaryLogisticRegressionSummary) trainingSummary;

  // Show the ROC curve, then only its "FPR" column, then print the area under the curve.
  Dataset<Row> roc = binarySummary.roc();
  roc.show();
  roc.select("FPR").show();
  System.out.println(binarySummary.areaUnderROC());

  // Find the largest F-Measure, look up the threshold of a row that attains it, and set that
  // threshold on the model.
  Dataset<Row> fMeasure = binarySummary.fMeasureByThreshold();
  Row maxRow = fMeasure.select(functions.max("F-Measure")).head();
  double maxFMeasure = maxRow.getDouble(0);
  Row bestRow = fMeasure
    .where(fMeasure.col("F-Measure").equalTo(maxFMeasure))
    .select("threshold")
    .head();
  double bestThreshold = bestRow.getDouble(0);
  lrModel.setThreshold(bestThreshold);
  // $example off$

  spark.stop();
}

Source File: PipelineBridgeTest.java From spark-transformers with Apache License 2.0

4 votes

@Test
public void testPipeline() {
    // Labeled training documents with schema (id, text, label).
    StructField[] trainingFields = {
            createStructField("id", LongType, false),
            createStructField("text", StringType, false),
            createStructField("label", DoubleType, false)
    };
    StructType schema = createStructType(trainingFields);
    DataFrame trainingData = sqlContext.createDataFrame(
            Arrays.asList(
                    cr(0L, "a b c d e spark", 1.0),
                    cr(1L, "b d", 0.0),
                    cr(2L, "spark f g h", 1.0),
                    cr(3L, "hadoop mapreduce", 0.0)),
            schema);

    // Three-stage pipeline: regex tokenizer -> hashingTF -> logistic regression.
    RegexTokenizer tokenizer = new RegexTokenizer();
    tokenizer.setInputCol("text");
    tokenizer.setOutputCol("words");
    tokenizer.setPattern("\\s");
    tokenizer.setGaps(true);
    tokenizer.setToLowercase(false);

    HashingTF hashingTF = new HashingTF();
    hashingTF.setNumFeatures(1000);
    hashingTF.setInputCol(tokenizer.getOutputCol());
    hashingTF.setOutputCol("features");

    LogisticRegression lr = new LogisticRegression();
    lr.setMaxIter(10);
    lr.setRegParam(0.01);

    PipelineStage[] stages = {tokenizer, hashingTF, lr};
    Pipeline pipeline = new Pipeline().setStages(stages);

    // Train the whole pipeline on the labeled documents.
    PipelineModel sparkPipelineModel = pipeline.fit(trainingData);

    // Export the fitted pipeline, dump the exported bytes as text, and rebuild a Transformer.
    byte[] serialized = ModelExporter.export(sparkPipelineModel, trainingData);
    System.out.println(new String(serialized));
    Transformer transformer = ModelImporter.importAndGetTransformer(serialized);

    // Unlabeled test documents with schema (id, text).
    StructField[] testFields = {
            createStructField("id", LongType, false),
            createStructField("text", StringType, false)
    };
    StructType testSchema = createStructType(testFields);
    DataFrame testData = sqlContext.createDataFrame(
            Arrays.asList(
                    cr(4L, "spark i j k"),
                    cr(5L, "l m n"),
                    cr(6L, "mapreduce spark"),
                    cr(7L, "apache hadoop")),
            testSchema);

    // The Spark pipeline and the exported pipeline must predict the same class per document.
    Row[] predictions = sparkPipelineModel.transform(testData)
            .select("id", "text", "probability", "prediction")
            .collect();
    for (Row row : predictions) {
        System.out.println(row);
        double sparkPrediction = row.getDouble(3);

        Map<String, Object> record = new HashMap<String, Object>();
        record.put("text", row.getString(1));
        transformer.transform(record);

        double exportedPrediction = (double) record.get("prediction");
        // Not asserted on; the cast still requires "probability" to be present as a Double.
        double exportedProbability = (double) record.get("probability");
        assertEquals(sparkPrediction, exportedPrediction, EPSILON);
    }
}

Source File: PipelineBridgeTest.java From spark-transformers with Apache License 2.0

4 votes

@Test
public void testPipeline() {
    // Labeled training documents with schema (id, text, label).
    StructField[] trainingFields = {
            createStructField("id", LongType, false),
            createStructField("text", StringType, false),
            createStructField("label", DoubleType, false)
    };
    StructType schema = createStructType(trainingFields);
    Dataset<Row> trainingData = spark.createDataFrame(
            Arrays.asList(
                    cr(0L, "a b c d e spark", 1.0),
                    cr(1L, "b d", 0.0),
                    cr(2L, "spark f g h", 1.0),
                    cr(3L, "hadoop mapreduce", 0.0)),
            schema);

    // Three-stage pipeline: regex tokenizer -> hashingTF -> logistic regression.
    RegexTokenizer tokenizer = new RegexTokenizer();
    tokenizer.setInputCol("text");
    tokenizer.setOutputCol("words");
    tokenizer.setPattern("\\s");
    tokenizer.setGaps(true);
    tokenizer.setToLowercase(false);

    HashingTF hashingTF = new HashingTF();
    hashingTF.setNumFeatures(1000);
    hashingTF.setInputCol(tokenizer.getOutputCol());
    hashingTF.setOutputCol("features");

    LogisticRegression lr = new LogisticRegression();
    lr.setMaxIter(10);
    lr.setRegParam(0.01);

    PipelineStage[] stages = {tokenizer, hashingTF, lr};
    Pipeline pipeline = new Pipeline().setStages(stages);

    // Train the whole pipeline on the labeled documents.
    PipelineModel sparkPipelineModel = pipeline.fit(trainingData);

    // Export the fitted pipeline, dump the exported bytes as text, and rebuild a Transformer.
    byte[] serialized = ModelExporter.export(sparkPipelineModel);
    System.out.println(new String(serialized));
    Transformer transformer = ModelImporter.importAndGetTransformer(serialized);

    // Unlabeled test documents with schema (id, text).
    StructField[] testFields = {
            createStructField("id", LongType, false),
            createStructField("text", StringType, false)
    };
    StructType testSchema = createStructType(testFields);
    Dataset<Row> testData = spark.createDataFrame(
            Arrays.asList(
                    cr(4L, "spark i j k"),
                    cr(5L, "l m n"),
                    cr(6L, "mapreduce spark"),
                    cr(7L, "apache hadoop")),
            testSchema);

    // The Spark pipeline and the exported pipeline must predict the same class per document.
    List<Row> predictions = sparkPipelineModel.transform(testData)
            .select("id", "text", "probability", "prediction")
            .collectAsList();
    for (Row row : predictions) {
        System.out.println(row);
        double sparkPrediction = row.getDouble(3);

        Map<String, Object> record = new HashMap<String, Object>();
        record.put("text", row.getString(1));
        transformer.transform(record);

        double exportedPrediction = (double) record.get("prediction");
        // Not asserted on; the cast still requires "probability" to be present as a Double.
        double exportedProbability = (double) record.get("probability");
        assertEquals(sparkPrediction, exportedPrediction, 0.01);
    }
}

Source File: WhitespaceClassifier.java From vn.vitk with GNU General Public License v3.0

4 votes

/**
 * Trains a whitespace classifier and saves the fitted pipeline model to an external file.
 * <p>
 * Every space or underscore character of every (trimmed) sentence becomes one training
 * example: a space is labeled 0 and an underscore is labeled 1. The fitted model is kept in
 * the {@code model} field, and training diagnostics are printed to standard output.
 *
 * @param sentences a list of tokenized sentences.
 * @param pipelineModelFileName location the fitted pipeline model is saved to; an existing
 *        model there is overwritten
 * @param numFeatures number of features passed to the hashing term-frequency stage
 */
public void train(List<String> sentences, String pipelineModelFileName, int numFeatures) {
	// Collect one context per space/underscore occurrence, numbered consecutively from 0.
	List<WhitespaceContext> contexts = new ArrayList<WhitespaceContext>(sentences.size());
	int nextId = 0;
	for (String rawSentence : sentences) {
		String sentence = rawSentence.trim();
		int length = sentence.length();
		for (int pos = 0; pos < length; pos++) {
			char c = sentence.charAt(pos);
			if (c != ' ' && c != '_') {
				continue;
			}
			WhitespaceContext context = new WhitespaceContext();
			context.setId(nextId++);
			context.setContext(extractContext(sentence, pos));
			if (c == ' ') {
				context.setLabel(0d);
			} else {
				context.setLabel(1d);
			}
			contexts.add(context);
		}
	}

	// Turn the contexts into a data frame and print it, its size and its label distribution.
	JavaRDD<WhitespaceContext> contextRdd = jsc.parallelize(contexts);
	DataFrame df = sqlContext.createDataFrame(contextRdd, WhitespaceContext.class);
	df.show(false);
	System.out.println("N = " + df.count());
	df.groupBy("label").count().show();

	// Pipeline: tokenizer -> hashingTF -> logistic regression.
	org.apache.spark.ml.feature.Tokenizer tokenizer = new Tokenizer();
	tokenizer.setInputCol("context");
	tokenizer.setOutputCol("words");

	HashingTF hashingTF = new HashingTF();
	hashingTF.setNumFeatures(numFeatures);
	hashingTF.setInputCol(tokenizer.getOutputCol());
	hashingTF.setOutputCol("features");

	LogisticRegression lr = new LogisticRegression();
	lr.setMaxIter(100);
	lr.setRegParam(0.01);

	PipelineStage[] stages = { tokenizer, hashingTF, lr };
	Pipeline pipeline = new Pipeline().setStages(stages);
	model = pipeline.fit(df);

	// Saving is best effort: a failure is reported on stderr and training output continues.
	try {
		model.write().overwrite().save(pipelineModelFileName);
	} catch (IOException e) {
		e.printStackTrace();
	}

	// Evaluate on the training data itself with the "precision" metric.
	DataFrame predictions = model.transform(df);
	predictions.show();
	MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator();
	evaluator.setMetricName("precision");
	double accuracy = evaluator.evaluate(predictions);
	System.out.println("training accuracy = " + accuracy);

	// The logistic regression model is the third pipeline stage; print its objective history.
	LogisticRegressionModel lrModel = (LogisticRegressionModel) model.stages()[2];
	LogisticRegressionTrainingSummary trainingSummary = lrModel.summary();
	double[] objectiveHistory = trainingSummary.objectiveHistory();
	System.out.println("#(iterations) = " + objectiveHistory.length);
	for (int iteration = 0; iteration < objectiveHistory.length; iteration++) {
		System.out.println(objectiveHistory[iteration]);
	}
}

Source File: JavaEstimatorTransformerParamExample.java From Apache-Spark-2x-for-Java-Developers with MIT License

4 votes

public static void main(String[] args) {
   // Local-mode session with an explicit SQL warehouse directory.
   SparkSession spark = SparkSession.builder()
     .master("local")
     .config("spark.sql.warehouse.dir", "file:///C:/Users/sumit.kumar/Downloads/bin/warehouse")
     .appName("JavaEstimatorTransformerParamExample")
     .getOrCreate();

   // Raise the root logger level to WARN.
   LogManager.getRootLogger().setLevel(Level.WARN);

   // $example on$
   // Training rows of (label, features).
   List<Row> dataTraining = Arrays.asList(
       RowFactory.create(1.0, Vectors.dense(0.0, 1.1, 0.1)),
       RowFactory.create(0.0, Vectors.dense(2.0, 1.0, -1.0)),
       RowFactory.create(0.0, Vectors.dense(2.0, 1.3, 1.0)),
       RowFactory.create(1.0, Vectors.dense(0.0, 1.2, -0.5)));
   StructField labelField =
       new StructField("label", DataTypes.DoubleType, false, Metadata.empty());
   StructField featuresField =
       new StructField("features", new VectorUDT(), false, Metadata.empty());
   StructType schema = new StructType(new StructField[]{labelField, featuresField});
   Dataset<Row> training = spark.createDataFrame(dataTraining, schema);

   // The LogisticRegression instance is an Estimator; print its parameters, their
   // documentation and any default values.
   LogisticRegression lr = new LogisticRegression();
   System.out.println("LogisticRegression parameters:\n" + lr.explainParams() + "\n");

   // Parameters can be set through setter methods.
   lr.setMaxIter(10);
   lr.setRegParam(0.01);

   // Fit a first model with the parameters stored in lr, then print the parameters of the
   // estimator that produced it as (name: value) pairs.
   LogisticRegressionModel model1 = lr.fit(training);
   System.out.println("Model 1 was fit using parameters: " + model1.parent().extractParamMap());

   // Parameters can alternatively be given through a ParamMap.
   ParamMap paramMap = new ParamMap();
   paramMap.put(lr.maxIter().w(20));  // Specify 1 Param.
   paramMap.put(lr.maxIter(), 30);  // This overwrites the original maxIter.
   paramMap.put(lr.regParam().w(0.1), lr.threshold().w(0.55));  // Specify multiple Params.

   // ParamMaps can be combined; the second map renames the probability output column.
   ParamMap paramMap2 = new ParamMap();
   paramMap2.put(lr.probabilityCol().w("myProbability"));
   ParamMap paramMapCombined = paramMap.$plus$plus(paramMap2);

   // Fit a second model; paramMapCombined overrides the parameters set earlier via
   // lr.set* methods.
   LogisticRegressionModel model2 = lr.fit(training, paramMapCombined);
   System.out.println("Model 2 was fit using parameters: " + model2.parent().extractParamMap());

   // Test rows, using the same schema as the training rows.
   List<Row> dataTest = Arrays.asList(
       RowFactory.create(1.0, Vectors.dense(-1.0, 1.5, 1.3)),
       RowFactory.create(0.0, Vectors.dense(3.0, 2.0, -0.1)),
       RowFactory.create(1.0, Vectors.dense(0.0, 2.2, -1.5)));
   Dataset<Row> test = spark.createDataFrame(dataTest, schema);

   // Predict with model2. Its probabilities are in 'myProbability' rather than the usual
   // 'probability' column because lr.probabilityCol was renamed above.
   Dataset<Row> rows = model2.transform(test)
     .select("features", "label", "myProbability", "prediction");
   List<Row> collected = rows.collectAsList();
   for (Row row : collected) {
     String line = "(" + row.get(0) + ", " + row.get(1) + ") -> prob=" + row.get(2)
       + ", prediction=" + row.get(3);
     System.out.println(line);
   }
   // $example off$

   spark.stop();
}

Source File: JavaOneVsRestExample.java From SparkDemo with MIT License

4 votes

public static void main(String[] args) {
  SparkSession spark =
    SparkSession.builder().appName("JavaOneVsRestExample").getOrCreate();

  // $example on$
  // Read the multiclass sample data set, which is stored in libsvm format.
  Dataset<Row> inputData = spark.read()
    .format("libsvm")
    .load("data/mllib/sample_multiclass_classification_data.txt");

  // Randomly split the data with weights 0.8 (train) and 0.2 (test).
  double[] splitWeights = {0.8, 0.2};
  Dataset<Row>[] splits = inputData.randomSplit(splitWeights);
  Dataset<Row> train = splits[0];
  Dataset<Row> test = splits[1];

  // Base classifier that One-vs-Rest trains.
  LogisticRegression classifier = new LogisticRegression();
  classifier.setMaxIter(10);
  classifier.setTol(1E-6);
  classifier.setFitIntercept(true);

  // Wrap the base classifier in One-vs-Rest and train the multiclass model.
  OneVsRest ovr = new OneVsRest().setClassifier(classifier);
  OneVsRestModel ovrModel = ovr.fit(train);

  // Score the test split, keeping only the columns the evaluator is given.
  Dataset<Row> predictions = ovrModel.transform(test).select("prediction", "label");

  // Evaluate accuracy and report its complement as the test error.
  MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator();
  evaluator.setMetricName("accuracy");
  double accuracy = evaluator.evaluate(predictions);
  System.out.println("Test Error = " + (1 - accuracy));
  // $example off$

  spark.stop();
}

Source File: JavaEstimatorTransformerParamExample.java From SparkDemo with MIT License

4 votes

public static void main(String[] args) {
  SparkSession spark =
    SparkSession.builder().appName("JavaEstimatorTransformerParamExample").getOrCreate();

  // $example on$
  // Training rows of (label, features).
  List<Row> dataTraining = Arrays.asList(
      RowFactory.create(1.0, Vectors.dense(0.0, 1.1, 0.1)),
      RowFactory.create(0.0, Vectors.dense(2.0, 1.0, -1.0)),
      RowFactory.create(0.0, Vectors.dense(2.0, 1.3, 1.0)),
      RowFactory.create(1.0, Vectors.dense(0.0, 1.2, -0.5)));
  StructField[] fields = {
      new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
      new StructField("features", new VectorUDT(), false, Metadata.empty())
  };
  StructType schema = new StructType(fields);
  Dataset<Row> training = spark.createDataFrame(dataTraining, schema);

  // The LogisticRegression instance is an Estimator; print its parameters, their
  // documentation and any default values.
  LogisticRegression lr = new LogisticRegression();
  String paramHelp = "LogisticRegression parameters:\n" + lr.explainParams() + "\n";
  System.out.println(paramHelp);

  // Parameters can be set through setter methods.
  lr.setMaxIter(10);
  lr.setRegParam(0.01);

  // Fit a first model with the parameters stored in lr, then print the parameters of the
  // estimator that produced it as (name: value) pairs.
  LogisticRegressionModel model1 = lr.fit(training);
  System.out.println("Model 1 was fit using parameters: " + model1.parent().extractParamMap());

  // Parameters can alternatively be given through a ParamMap.
  ParamMap paramMap = new ParamMap();
  paramMap.put(lr.maxIter().w(20));  // Specify 1 Param.
  paramMap.put(lr.maxIter(), 30);  // This overwrites the original maxIter.
  paramMap.put(lr.regParam().w(0.1), lr.threshold().w(0.55));  // Specify multiple Params.

  // ParamMaps can be combined; the second map renames the probability output column.
  ParamMap paramMap2 = new ParamMap();
  paramMap2.put(lr.probabilityCol().w("myProbability"));
  ParamMap paramMapCombined = paramMap.$plus$plus(paramMap2);

  // Fit a second model; paramMapCombined overrides the parameters set earlier via
  // lr.set* methods.
  LogisticRegressionModel model2 = lr.fit(training, paramMapCombined);
  System.out.println("Model 2 was fit using parameters: " + model2.parent().extractParamMap());

  // Test rows, using the same schema as the training rows.
  List<Row> dataTest = Arrays.asList(
      RowFactory.create(1.0, Vectors.dense(-1.0, 1.5, 1.3)),
      RowFactory.create(0.0, Vectors.dense(3.0, 2.0, -0.1)),
      RowFactory.create(1.0, Vectors.dense(0.0, 2.2, -1.5)));
  Dataset<Row> test = spark.createDataFrame(dataTest, schema);

  // Predict with model2. Its probabilities are in 'myProbability' rather than the usual
  // 'probability' column because lr.probabilityCol was renamed above.
  Dataset<Row> results = model2.transform(test);
  List<Row> collected =
    results.select("features", "label", "myProbability", "prediction").collectAsList();
  for (Row row : collected) {
    System.out.println("(" + row.get(0) + ", " + row.get(1) + ") -> prob=" + row.get(2)
      + ", prediction=" + row.get(3));
  }
  // $example off$

  spark.stop();
}

Source File: JavaModelSelectionViaCrossValidationExample.java From SparkDemo with MIT License

4 votes

public static void main(String[] args) {
  SparkSession spark = SparkSession.builder()
    .appName("JavaModelSelectionViaCrossValidationExample")
    .getOrCreate();

  // $example on$
  // Twelve labeled training documents.
  Dataset<Row> training = spark.createDataFrame(
    Arrays.asList(
      new JavaLabeledDocument(0L, "a b c d e spark", 1.0),
      new JavaLabeledDocument(1L, "b d", 0.0),
      new JavaLabeledDocument(2L, "spark f g h", 1.0),
      new JavaLabeledDocument(3L, "hadoop mapreduce", 0.0),
      new JavaLabeledDocument(4L, "b spark who", 1.0),
      new JavaLabeledDocument(5L, "g d a y", 0.0),
      new JavaLabeledDocument(6L, "spark fly", 1.0),
      new JavaLabeledDocument(7L, "was mapreduce", 0.0),
      new JavaLabeledDocument(8L, "e spark program", 1.0),
      new JavaLabeledDocument(9L, "a e c l", 0.0),
      new JavaLabeledDocument(10L, "spark compile", 1.0),
      new JavaLabeledDocument(11L, "hadoop software", 0.0)),
    JavaLabeledDocument.class);

  // Three-stage ML pipeline: tokenizer -> hashingTF -> lr.
  Tokenizer tokenizer = new Tokenizer();
  tokenizer.setInputCol("text");
  tokenizer.setOutputCol("words");

  HashingTF hashingTF = new HashingTF();
  hashingTF.setNumFeatures(1000);
  hashingTF.setInputCol(tokenizer.getOutputCol());
  hashingTF.setOutputCol("features");

  LogisticRegression lr = new LogisticRegression();
  lr.setMaxIter(10);
  lr.setRegParam(0.01);

  PipelineStage[] stages = {tokenizer, hashingTF, lr};
  Pipeline pipeline = new Pipeline().setStages(stages);

  // Parameter grid to search: 3 values for hashingTF.numFeatures times 2 values for
  // lr.regParam gives 3 x 2 = 6 settings for the CrossValidator to choose from.
  int[] numFeaturesCandidates = {10, 100, 1000};
  double[] regParamCandidates = {0.1, 0.01};
  ParamMap[] paramGrid = new ParamGridBuilder()
    .addGrid(hashingTF.numFeatures(), numFeaturesCandidates)
    .addGrid(lr.regParam(), regParamCandidates)
    .build();

  // The whole Pipeline is the Estimator being cross-validated, so parameters of all its
  // stages are chosen jointly. The evaluator is a BinaryClassificationEvaluator with its
  // default metric (areaUnderROC).
  CrossValidator cv = new CrossValidator();
  cv.setEstimator(pipeline);
  cv.setEvaluator(new BinaryClassificationEvaluator());
  cv.setEstimatorParamMaps(paramGrid);
  cv.setNumFolds(2);  // Use 3+ in practice

  // Run cross-validation and keep the best parameter setting.
  CrossValidatorModel cvModel = cv.fit(training);

  // Four unlabeled test documents.
  Dataset<Row> test = spark.createDataFrame(
    Arrays.asList(
      new JavaDocument(4L, "spark i j k"),
      new JavaDocument(5L, "l m n"),
      new JavaDocument(6L, "mapreduce spark"),
      new JavaDocument(7L, "apache hadoop")),
    JavaDocument.class);

  // Predict with cvModel, which uses the best model found, and print one line per document.
  Dataset<Row> predictions = cvModel.transform(test);
  Dataset<Row> selected = predictions.select("id", "text", "probability", "prediction");
  for (Row row : selected.collectAsList()) {
    String line = "(" + row.get(0) + ", " + row.get(1) + ") --> prob=" + row.get(2)
      + ", prediction=" + row.get(3);
    System.out.println(line);
  }
  // $example off$

  spark.stop();
}

Source File: JavaPipelineExample.java From SparkDemo with MIT License

4 votes

/**
 * Entry point of the pipeline example: fits a tokenizer -> hashingTF -> logistic
 * regression pipeline on four labeled documents, then prints the predicted
 * probability and label for four unlabeled documents.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
  SparkSession spark =
    SparkSession.builder().appName("JavaPipelineExample").getOrCreate();

  // $example on$
  // Labeled documents used for training.
  JavaLabeledDocument[] labeledDocs = {
    new JavaLabeledDocument(0L, "a b c d e spark", 1.0),
    new JavaLabeledDocument(1L, "b d", 0.0),
    new JavaLabeledDocument(2L, "spark f g h", 1.0),
    new JavaLabeledDocument(3L, "hadoop mapreduce", 0.0)
  };
  Dataset<Row> training =
    spark.createDataFrame(Arrays.asList(labeledDocs), JavaLabeledDocument.class);

  // Stage 1: split the "text" column into a "words" column.
  Tokenizer tokenizer = new Tokenizer();
  tokenizer.setInputCol("text");
  tokenizer.setOutputCol("words");

  // Stage 2: hash the tokenizer output into a 1000-dimensional "features" column.
  HashingTF hashingTF = new HashingTF();
  hashingTF.setNumFeatures(1000);
  hashingTF.setInputCol(tokenizer.getOutputCol());
  hashingTF.setOutputCol("features");

  // Stage 3: the logistic regression estimator.
  LogisticRegression lr = new LogisticRegression();
  lr.setMaxIter(10);
  lr.setRegParam(0.001);

  // Chain the three stages into a single pipeline.
  PipelineStage[] stages = {tokenizer, hashingTF, lr};
  Pipeline pipeline = new Pipeline().setStages(stages);

  // Fit the whole pipeline to the training documents.
  PipelineModel model = pipeline.fit(training);

  // Unlabeled documents to score.
  JavaDocument[] unlabeledDocs = {
    new JavaDocument(4L, "spark i j k"),
    new JavaDocument(5L, "l m n"),
    new JavaDocument(6L, "spark hadoop spark"),
    new JavaDocument(7L, "apache hadoop")
  };
  Dataset<Row> test =
    spark.createDataFrame(Arrays.asList(unlabeledDocs), JavaDocument.class);

  // Run the fitted pipeline on the test documents and print one line per document.
  Dataset<Row> predictions = model.transform(test);
  Dataset<Row> report = predictions.select("id", "text", "probability", "prediction");
  for (Row row : report.collectAsList()) {
    Object id = row.get(0);
    Object text = row.get(1);
    Object probability = row.get(2);
    Object prediction = row.get(3);
    System.out.println("(" + id + ", " + text + ") --> prob=" + probability
      + ", prediction=" + prediction);
  }
  // $example off$

  spark.stop();
}

Source File: DatasetClassifier.java From mmtf-spark with Apache License 2.0

4 votes

/**
 * Reads a dataset from a Parquet file, balances it by the given label column, and
 * then fits a decision tree, a random forest, a logistic regression and a
 * multilayer perceptron classifier on it, printing the metrics returned for each
 * one followed by the total elapsed time.
 *
 * <p>The dataset is expected to contain a "features" vector column; its dimension
 * is used as the width of the neural network's input layer.
 *
 * @param args args[0] path to parquet file, args[1] name of classification column
 * @throws IOException declared by this entry point; presumably propagated from one
 *     of the helper calls (the throwing call is not visible here)
 */
public static void main(String[] args) throws IOException {

	if (args.length != 2) {
		System.err.println("Usage: " + DatasetClassifier.class.getSimpleName() + " <parquet file> <classification column name>");
		System.exit(1);
	}

	// name of the class label
	String label = args[1];
	
	long start = System.nanoTime();

	SparkSession spark = SparkSession
			.builder()
			.master("local[*]")
			.appName(DatasetClassifier.class.getSimpleName())
			.getOrCreate();

	// cached because the dataset is scanned several times below
	Dataset<Row> data = spark.read().parquet(args[0]).cache();
	
	// Determine the dimension of the feature vectors from the first row.
	// size() is used rather than numActives(): for a SparseVector, numActives()
	// counts only the explicitly stored entries, which is smaller than the real
	// dimension and would produce a wrong input-layer width for the neural network.
	int featureCount = 0;
	Object vector = data.first().getAs("features");
	if (vector instanceof DenseVector) {
	   featureCount = ((DenseVector)vector).size();
	} else if (vector instanceof SparseVector) {
	   featureCount = ((SparseVector)vector).size();
	}
	
	System.out.println("Feature count            : "  + featureCount);
	
	int classCount = (int)data.select(label).distinct().count();
	System.out.println("Class count              : " + classCount);

	System.out.println("Dataset size (unbalanced): " + data.count());
	data.groupBy(label).count().show(classCount);

	// NOTE(review): presumably downsamples the larger classes so that all classes
	// are equally represented; the last argument looks like a seed -- confirm
	// against DatasetBalancer.
	data = DatasetBalancer.downsample(data, label, 1);
	
	System.out.println("Dataset size (balanced)  : " + data.count());
	data.groupBy(label).count().show(classCount);

	// the same test fraction and seed are passed to every classifier wrapper
	double testFraction = 0.3;
	long seed = 123;

	SparkMultiClassClassifier mcc;
	Map<String, String> metrics;

	DecisionTreeClassifier dtc = new DecisionTreeClassifier();
	mcc = new SparkMultiClassClassifier(dtc, label, testFraction, seed);
	metrics = mcc.fit(data);
	System.out.println(metrics);

	RandomForestClassifier rfc = new RandomForestClassifier();
	mcc = new SparkMultiClassClassifier(rfc, label, testFraction, seed);
	metrics = mcc.fit(data);
	System.out.println(metrics);

	LogisticRegression lr = new LogisticRegression();
	mcc = new SparkMultiClassClassifier(lr, label, testFraction, seed);
	metrics = mcc.fit(data);
	System.out.println(metrics);

	// specify layers for the neural network
	//    input layer: dimension of feature vector
	//    hidden layer: 10 units
	//    output layer: number of classes
	int[] layers = new int[] {featureCount, 10, classCount};
	MultilayerPerceptronClassifier mpc = new MultilayerPerceptronClassifier()
			.setLayers(layers)
			.setBlockSize(128)
			.setSeed(1234L)
			.setMaxIter(200);

	mcc = new SparkMultiClassClassifier(mpc, label, testFraction, seed);
	metrics = mcc.fit(data);
	System.out.println(metrics);

	long end = System.nanoTime();

	System.out.println((end-start)/1E9 + " sec");

	// release the Spark session once all work is done
	spark.stop();
}

org.apache.spark.ml.classification.LogisticRegression Java Examples