Java Code Examples for org.apache.spark.ml.feature.StringIndexer

The following examples show how to use org.apache.spark.ml.feature.StringIndexer. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: vn.vitk   Source File: CMM.java    License: GNU General Public License v3.0 9 votes vote down vote up
/**
 * Builds the tagging pipeline: tokenize feature strings, vectorize the
 * tokens by count, and index the string tags into numeric labels.
 *
 * @return a three-stage {@link Pipeline}
 */
private Pipeline createPipeline() {
	// Split each feature string into individual tokens.
	Tokenizer tokenizer = new Tokenizer()
		.setInputCol("featureStrings")
		.setOutputCol("tokens");
	// Count-vectorize tokens, bounded by the configured minimum document
	// frequency and vocabulary size parameters.
	CountVectorizer vectorizer = new CountVectorizer()
		.setInputCol("tokens")
		.setOutputCol("features")
		.setMinDF((Double) params.getOrDefault(params.getMinFF()))
		.setVocabSize((Integer) params.getOrDefault(params.getNumFeatures()));
	// Map string tags onto a numeric "label" column for the learner.
	StringIndexer tagIndexer = new StringIndexer()
		.setInputCol("tag")
		.setOutputCol("label");

	return new Pipeline().setStages(new PipelineStage[] {tokenizer, vectorizer, tagIndexer});
}
 
Example 2
Source Project: vn.vitk   Source File: TransitionClassifier.java    License: GNU General Public License v3.0 6 votes vote down vote up
/**
 * Builds the transition-classification pipeline: tokenize the text,
 * vectorize the tokens by count, and index the transition strings into
 * numeric labels.
 *
 * @return a three-stage {@link Pipeline}
 */
protected Pipeline createPipeline() {
	// Split the raw text into tokens.
	Tokenizer tokenizer = new Tokenizer()
		.setInputCol("text")
		.setOutputCol("tokens");
	// Count-vectorize tokens, bounded by the configured minimum document
	// frequency and vocabulary size parameters.
	CountVectorizer vectorizer = new CountVectorizer()
		.setInputCol("tokens")
		.setOutputCol("features")
		.setMinDF((Double) params.getOrDefault(params.getMinFF()))
		.setVocabSize((Integer) params.getOrDefault(params.getNumFeatures()));
	// Map transition names onto a numeric "label" column.
	StringIndexer transitionIndexer = new StringIndexer()
		.setInputCol("transition")
		.setOutputCol("label");

	return new Pipeline().setStages(new PipelineStage[] {tokenizer, vectorizer, transitionIndexer});
}
 
Example 3
Source Project: SparkDemo   Source File: JavaOneHotEncoderExample.java    License: MIT License 5 votes vote down vote up
/**
 * Example: index a string category column, then one-hot encode the
 * resulting numeric index into a sparse vector column.
 */
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaOneHotEncoderExample")
    .getOrCreate();

  // $example on$
  // Six rows over a three-value category column.
  StructType schema = new StructType(new StructField[]{
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("category", DataTypes.StringType, false, Metadata.empty())
  });
  List<Row> rows = Arrays.asList(
    RowFactory.create(0, "a"),
    RowFactory.create(1, "b"),
    RowFactory.create(2, "c"),
    RowFactory.create(3, "a"),
    RowFactory.create(4, "a"),
    RowFactory.create(5, "c")
  );
  Dataset<Row> df = spark.createDataFrame(rows, schema);

  // Map each category string to a numeric index.
  StringIndexerModel indexer = new StringIndexer()
    .setInputCol("category")
    .setOutputCol("categoryIndex")
    .fit(df);
  Dataset<Row> indexed = indexer.transform(df);

  // Expand the index into a one-hot vector and display the result.
  OneHotEncoder encoder = new OneHotEncoder()
    .setInputCol("categoryIndex")
    .setOutputCol("categoryVec");
  encoder.transform(indexed).show();
  // $example off$

  spark.stop();
}
 
Example 4
@Test
public void testStringIndexer() {

    // Schema: integer id plus the string label column to be indexed.
    StructType schema = createStructType(new StructField[]{
            createStructField("id", IntegerType, false),
            createStructField("label", StringType, false)
    });
    Dataset<Row> dataset = spark.createDataFrame(
            Arrays.asList(cr(0, "a"), cr(1, "b"), cr(2, "c"), cr(3, "a"), cr(4, "a"), cr(5, "c")),
            schema);

    // Fit the indexer in Spark.
    StringIndexerModel model = new StringIndexer()
            .setInputCol("label")
            .setOutputCol("labelIndex")
            .fit(dataset);

    // Round-trip the fitted model through the exporter/importer.
    byte[] exportedModel = ModelExporter.export(model);
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    // Every Spark prediction must be reproduced by the imported transformer.
    List<Row> sparkOutput = model.transform(dataset)
            .orderBy("id").select("id", "label", "labelIndex").collectAsList();
    for (Row row : sparkOutput) {
        Map<String, Object> data = new HashMap<String, Object>();
        data.put(model.getInputCol(), (String) row.get(1));
        transformer.transform(data);
        double indexerOutput = (double) data.get(model.getOutputCol());
        assertEquals(indexerOutput, (double) row.get(2), 0.01);
    }

}
 
Example 5
@Test
public void testStringIndexer() {

    // Schema: integer id plus the string label column to be indexed.
    StructType schema = createStructType(new StructField[]{
            createStructField("id", IntegerType, false),
            createStructField("label", StringType, false)
    });
    DataFrame dataset = sqlContext.createDataFrame(
            Arrays.asList(cr(0, "a"), cr(1, "b"), cr(2, "c"), cr(3, "a"), cr(4, "a"), cr(5, "c")),
            schema);

    // Fit the indexer in Spark.
    StringIndexerModel model = new StringIndexer()
            .setInputCol("label")
            .setOutputCol("labelIndex")
            .fit(dataset);

    // Round-trip the fitted model through the exporter/importer.
    byte[] exportedModel = ModelExporter.export(model, dataset);
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    // Every Spark prediction must be reproduced by the imported transformer.
    Row[] sparkOutput = model.transform(dataset)
            .orderBy("id").select("id", "label", "labelIndex").collect();
    for (Row row : sparkOutput) {
        Map<String, Object> data = new HashMap<String, Object>();
        data.put(model.getInputCol(), (String) row.get(1));
        transformer.transform(data);
        double indexerOutput = (double) data.get(model.getOutputCol());
        assertEquals(indexerOutput, (double) row.get(2), EPSILON);
    }

}
 
Example 6
@Test
public void testStringIndexerForDoubleColumn() {

    // Numeric labels; the indexer still works on their string form.
    StructType schema = createStructType(new StructField[]{
            createStructField("id", IntegerType, false),
            createStructField("label", DoubleType, false)
    });
    DataFrame dataset = sqlContext.createDataFrame(
            Arrays.asList(cr(0, 1.0), cr(1, 2.0), cr(2, 3.0), cr(3, 1.0), cr(4, 1.0), cr(5, 3.0)),
            schema);

    // Fit the indexer in Spark.
    StringIndexerModel model = new StringIndexer()
            .setInputCol("label")
            .setOutputCol("labelIndex")
            .fit(dataset);

    // Round-trip the fitted model through the exporter/importer.
    byte[] exportedModel = ModelExporter.export(model, dataset);
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    // Every Spark prediction must be reproduced by the imported transformer.
    Row[] sparkOutput = model.transform(dataset)
            .orderBy("id").select("id", "label", "labelIndex").collect();
    for (Row row : sparkOutput) {
        Map<String, Object> data = new HashMap<String, Object>();
        data.put(model.getInputCol(), row.getDouble(1));
        transformer.transform(data);
        double indexerOutput = (double) data.get(model.getOutputCol());
        assertEquals(indexerOutput, row.getDouble(2), EPSILON);
    }

}
 
Example 7
@Test(expected=RuntimeException.class)
public void testStringIndexerForUnseenValues() {

    // Train on labels {1.0, 2.0, 3.0} only.
    StructType schema = createStructType(new StructField[]{
            createStructField("id", IntegerType, false),
            createStructField("label", DoubleType, false)
    });
    DataFrame dataset = sqlContext.createDataFrame(
            Arrays.asList(cr(0, 1.0), cr(1, 2.0), cr(2, 3.0), cr(3, 1.0), cr(4, 1.0), cr(5, 3.0)),
            schema);

    StringIndexerModel model = new StringIndexer()
            .setInputCol("label")
            .setOutputCol("labelIndex")
            .fit(dataset);

    // Round-trip the fitted model through the exporter/importer.
    byte[] exportedModel = ModelExporter.export(model, dataset);
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    // Transforming a value never seen during training must throw.
    Map<String, Object> data = new HashMap<String, Object>();
    data.put(model.getInputCol(), 7.0);
    transformer.transform(data);
}
 
Example 8
Source Project: mmtf-spark   Source File: SparkMultiClassClassifier.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Trains the configured predictor on the given dataset and evaluates it
 * on a held-out split. The dataset must contain at least:
 *   label:    the class labels
 *   features: the feature vector
 *
 * @param data input dataset
 * @return ordered map of metric names to values
 */
public Map<String, String> fit(Dataset<Row> data) {
	int classCount = (int) data.select(label).distinct().count();

	// Index the string class labels into numeric form.
	StringIndexerModel labelIndexer = new StringIndexer()
	  .setInputCol(label)
	  .setOutputCol("indexedLabel")
	  .fit(data);

	// Split the data into training and test sets (testFraction held out).
	Dataset<Row>[] splits = data.randomSplit(new double[] {1.0 - testFraction, testFraction}, seed);
	Dataset<Row> trainingData = splits[0];
	Dataset<Row> testData = splits[1];

	String[] labels = labelIndexer.labels();

	// Report per-class row counts for both splits.
	System.out.println();
	System.out.println("Class\tTrain\tTest");
	for (String l : labels) {
		long trainCount = trainingData.select(label).filter(label + " = '" + l + "'").count();
		long testCount = testData.select(label).filter(label + " = '" + l + "'").count();
		System.out.println(l + "\t" + trainCount + "\t" + testCount);
	}

	// Point the predictor at the indexed label and feature columns.
	predictor
	  .setLabelCol("indexedLabel")
	  .setFeaturesCol("features");

	// Convert indexed predictions back to the original label strings.
	IndexToString labelConverter = new IndexToString()
	  .setInputCol("prediction")
	  .setOutputCol("predictedLabel")
	  .setLabels(labelIndexer.labels());

	// Chain indexer, predictor and converter into a single pipeline.
	Pipeline pipeline = new Pipeline()
	  .setStages(new PipelineStage[] {labelIndexer, predictor, labelConverter});

	// Train the model; this also runs the indexer.
	PipelineModel model = pipeline.fit(trainingData);

	// Predict on the held-out data; cached because it is scanned repeatedly.
	Dataset<Row> predictions = model.transform(testData).cache();

	// Display a small sample of predictions.
	System.out.println();
	System.out.println("Sample predictions: " + predictor.getClass().getSimpleName());
	predictions.sample(false, 0.1, seed).show(25);

	// Rename columns so the metrics classes read the indexed label under
	// the original label column name.
	predictions = predictions.withColumnRenamed(label, "stringLabel");
	predictions = predictions.withColumnRenamed("indexedLabel", label);

	// Collect metrics.
	Dataset<Row> pred = predictions.select("prediction", label);
	Map<String, String> metrics = new LinkedHashMap<>();
	metrics.put("Method", predictor.getClass().getSimpleName());

	// AUC is only defined for the binary case.
	if (classCount == 2) {
		BinaryClassificationMetrics b = new BinaryClassificationMetrics(pred);
		metrics.put("AUC", Float.toString((float) b.areaUnderROC()));
	}

	MulticlassMetrics m = new MulticlassMetrics(pred);
	metrics.put("F", Float.toString((float) m.weightedFMeasure()));
	metrics.put("Accuracy", Float.toString((float) m.accuracy()));
	metrics.put("Precision", Float.toString((float) m.weightedPrecision()));
	metrics.put("Recall", Float.toString((float) m.weightedRecall()));
	metrics.put("False Positive Rate", Float.toString((float) m.weightedFalsePositiveRate()));
	metrics.put("True Positive Rate", Float.toString((float) m.weightedTruePositiveRate()));
	metrics.put("", "\nConfusion Matrix\n"
	    + Arrays.toString(labels) + "\n"
	    + m.confusionMatrix().toString());

	return metrics;
}
 
Example 9
Source Project: SparkDemo   Source File: JavaStringIndexerExample.java    License: MIT License 4 votes vote down vote up
/**
 * Example: fit a StringIndexer on a string category column and show the
 * resulting numeric index column.
 */
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaStringIndexerExample")
    .getOrCreate();

  // $example on$
  // Six rows over a three-value category column.
  StructType schema = new StructType(new StructField[]{
    createStructField("id", IntegerType, false),
    createStructField("category", StringType, false)
  });
  List<Row> rows = Arrays.asList(
    RowFactory.create(0, "a"),
    RowFactory.create(1, "b"),
    RowFactory.create(2, "c"),
    RowFactory.create(3, "a"),
    RowFactory.create(4, "a"),
    RowFactory.create(5, "c")
  );
  Dataset<Row> df = spark.createDataFrame(rows, schema);

  // Fit on the data and transform it in one pass.
  StringIndexer indexer = new StringIndexer()
    .setInputCol("category")
    .setOutputCol("categoryIndex");
  indexer.fit(df).transform(df).show();
  // $example off$

  spark.stop();
}
 
Example 10
Source Project: SparkDemo   Source File: JavaIndexToStringExample.java    License: MIT License 4 votes vote down vote up
/**
 * Example: index a string column with StringIndexer, then recover the
 * original strings with IndexToString using the labels stored in the
 * output column's metadata.
 */
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaIndexToStringExample")
    .getOrCreate();

  // $example on$
  // Six rows over a three-value category column.
  StructType schema = new StructType(new StructField[]{
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("category", DataTypes.StringType, false, Metadata.empty())
  });
  List<Row> rows = Arrays.asList(
    RowFactory.create(0, "a"),
    RowFactory.create(1, "b"),
    RowFactory.create(2, "c"),
    RowFactory.create(3, "a"),
    RowFactory.create(4, "a"),
    RowFactory.create(5, "c")
  );
  Dataset<Row> df = spark.createDataFrame(rows, schema);

  // Index the category strings.
  StringIndexerModel indexer = new StringIndexer()
    .setInputCol("category")
    .setOutputCol("categoryIndex")
    .fit(df);
  Dataset<Row> indexed = indexer.transform(df);

  System.out.println("Transformed string column '" + indexer.getInputCol() + "' " +
      "to indexed column '" + indexer.getOutputCol() + "'");
  indexed.show();

  // The fitted labels live in the output column's metadata.
  StructField inputColSchema = indexed.schema().apply(indexer.getOutputCol());
  System.out.println("StringIndexer will store labels in output column metadata: " +
      Attribute.fromStructField(inputColSchema).toString() + "\n");

  // Convert the indices back to the original strings via that metadata.
  IndexToString converter = new IndexToString()
    .setInputCol("categoryIndex")
    .setOutputCol("originalCategory");
  Dataset<Row> converted = converter.transform(indexed);

  System.out.println("Transformed indexed column '" + converter.getInputCol() + "' back to " +
      "original string column '" + converter.getOutputCol() + "' using labels in metadata");
  converted.select("id", "categoryIndex", "originalCategory").show();

  // $example off$
  spark.stop();
}
 
Example 11
@Test
public void testDecisionTreeRegressionPrediction() {
    // Load the data stored in LIBSVM format as a DataFrame.
    String datapath = "src/test/resources/regression_test.libsvm";
    Dataset<Row> data = spark.read().format("libsvm").load(datapath);

    // Split the data into training and test sets (30% held out for testing).
    Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3});
    Dataset<Row> trainingData = splits[0];
    Dataset<Row> testData = splits[1];

    // Index the label column, skipping rows with labels unseen at fit time.
    StringIndexer indexer = new StringIndexer()
            .setInputCol("label")
            .setOutputCol("labelIndex")
            .setHandleInvalid("skip");

    DecisionTreeRegressor regressionModel = new DecisionTreeRegressor()
            .setLabelCol("labelIndex")
            .setFeaturesCol("features");

    Pipeline pipeline = new Pipeline()
            .setStages(new PipelineStage[]{indexer, regressionModel});

    PipelineModel sparkPipeline = pipeline.fit(trainingData);

    // Round-trip the fitted pipeline through export/import.
    byte[] exportedModel = ModelExporter.export(sparkPipeline);
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    List<Row> output = sparkPipeline.transform(testData)
            .select("features", "prediction", "label").collectAsList();

    // The imported transformer must reproduce Spark's predictions.
    for (Row row : output) {
        Map<String, Object> data_ = new HashMap<>();
        data_.put("features", ((SparseVector) row.get(0)).toArray());
        data_.put("label", (row.get(2)).toString());
        transformer.transform(data_);
        System.out.println(data_);
        System.out.println(data_.get("prediction"));
        assertEquals((double) data_.get("prediction"), (double) row.get(1), EPSILON);
    }
}
 
Example 12
/**
 * Verifies that an exported/imported gradient-boosted-tree pipeline
 * reproduces Spark's own predictions on held-out data.
 * Fixes: comment previously claimed a RandomForest was trained (it is a
 * GBTClassifier), a stray double semicolon, and inconsistent indentation.
 */
@Test
public void testGradientBoostClassification() {
	// Load the data stored in LIBSVM format as a DataFrame.
	String datapath = "src/test/resources/binary_classification_test.libsvm";
	Dataset<Row> data = spark.read().format("libsvm").load(datapath);

	// Index the label column into numeric form.
	StringIndexer indexer = new StringIndexer()
			.setInputCol("label")
			.setOutputCol("labelIndex");

	// Split the data into training and test sets (30% held out for testing)
	Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3});
	Dataset<Row> trainingData = splits[0];
	Dataset<Row> testData = splits[1];

	// Train a gradient-boosted-tree classifier on the indexed label.
	GBTClassifier classificationModel = new GBTClassifier()
			.setLabelCol("labelIndex")
			.setFeaturesCol("features");

	Pipeline pipeline = new Pipeline()
			.setStages(new PipelineStage[]{indexer, classificationModel});

	PipelineModel sparkPipeline = pipeline.fit(trainingData);

	// Export this model
	byte[] exportedModel = ModelExporter.export(sparkPipeline);

	// Import and get Transformer
	Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

	List<Row> sparkOutput = sparkPipeline.transform(testData)
			.select("features", "prediction", "label").collectAsList();

	// compare predictions
	for (Row row : sparkOutput) {
		Map<String, Object> data_ = new HashMap<>();
		data_.put("features", ((SparseVector) row.get(0)).toArray());
		data_.put("label", (row.get(2)).toString());
		transformer.transform(data_);
		System.out.println(data_);
		System.out.println(data_.get("prediction") + " ," + row.get(1));
		assertEquals((double) data_.get("prediction"), (double) row.get(1), EPSILON);
	}
}
 
Example 13
@Test
public void testDecisionTreeClassificationWithPipeline() {
    // Load the data stored in LIBSVM format as a DataFrame.
    String datapath = "src/test/resources/classification_test.libsvm";
    Dataset<Row> data = spark.read().format("libsvm").load(datapath);

    // Split the data into training and test sets (30% held out for testing).
    Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3});
    Dataset<Row> trainingData = splits[0];
    Dataset<Row> testData = splits[1];

    // Index the label column into numeric form.
    StringIndexer indexer = new StringIndexer()
            .setInputCol("label")
            .setOutputCol("labelIndex");

    // Train a DecisionTree model on the indexed label.
    DecisionTreeClassifier classificationModel = new DecisionTreeClassifier()
            .setLabelCol("labelIndex")
            .setFeaturesCol("features");

    // Fitting the pipeline also runs the indexer.
    PipelineModel sparkPipeline = new Pipeline()
            .setStages(new PipelineStage[]{indexer, classificationModel})
            .fit(trainingData);

    // Round-trip the fitted pipeline through export/import.
    byte[] exportedModel = ModelExporter.export(sparkPipeline);
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    List<Row> output = sparkPipeline.transform(testData)
            .select("features", "label","prediction","rawPrediction").collectAsList();

    // Both the prediction and the raw prediction must match Spark's output.
    for (Row row : output) {
        Map<String, Object> data_ = new HashMap<>();
        double[] actualRawPrediction = ((DenseVector) row.get(3)).toArray();
        data_.put("features", ((SparseVector) row.get(0)).toArray());
        data_.put("label", (row.get(1)).toString());
        transformer.transform(data_);
        System.out.println(data_);
        System.out.println(data_.get("prediction"));
        assertEquals((double) data_.get("prediction"), (double) row.get(2), EPSILON);
        assertArrayEquals((double[]) data_.get("rawPrediction"), actualRawPrediction, EPSILON);
    }
}
 
Example 14
@Test
public void testOneHotEncoding() {
    // Six rows over a three-value category column.
    JavaRDD<Row> jrdd = sc.parallelize(Arrays.asList(
            RowFactory.create(0d, "a"),
            RowFactory.create(1d, "b"),
            RowFactory.create(2d, "c"),
            RowFactory.create(3d, "a"),
            RowFactory.create(4d, "a"),
            RowFactory.create(5d, "c")
    ));
    StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("category", DataTypes.StringType, false, Metadata.empty())
    });
    DataFrame df = sqlContext.createDataFrame(jrdd, schema);

    // Index the category strings, then one-hot encode the indices.
    StringIndexerModel indexer = new StringIndexer()
            .setInputCol("category")
            .setOutputCol("categoryIndex")
            .fit(df);
    DataFrame indexed = indexer.transform(df);

    OneHotEncoder sparkModel = new OneHotEncoder()
            .setInputCol("categoryIndex")
            .setOutputCol("categoryVec");

    // Round-trip the encoder through export/import.
    byte[] exportedModel = ModelExporter.export(sparkModel, indexed);
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    // The imported transformer must reproduce Spark's encoded vectors.
    Row[] sparkOutput = sparkModel.transform(indexed)
            .orderBy("id").select("id", "categoryIndex", "categoryVec").collect();
    for (Row row : sparkOutput) {
        Map<String, Object> data = new HashMap<String, Object>();
        data.put(sparkModel.getInputCol(), row.getDouble(1));
        transformer.transform(data);
        double[] transformedOp = (double[]) data.get(sparkModel.getOutputCol());
        double[] sparkOp = ((Vector) row.get(2)).toArray();
        assertArrayEquals(transformedOp, sparkOp, EPSILON);
    }
}
 
Example 15
@Test
public void testCustomOneHotEncoding() {
    // Six rows over a three-value category column.
    JavaRDD<Row> jrdd = sc.parallelize(Arrays.asList(
            RowFactory.create(0d, "a"),
            RowFactory.create(1d, "b"),
            RowFactory.create(2d, "c"),
            RowFactory.create(3d, "a"),
            RowFactory.create(4d, "a"),
            RowFactory.create(5d, "c")
    ));
    StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("category", DataTypes.StringType, false, Metadata.empty())
    });
    DataFrame df = sqlContext.createDataFrame(jrdd, schema);

    // Index the category strings.
    StringIndexerModel indexer = new StringIndexer()
            .setInputCol("category")
            .setOutputCol("categoryIndex")
            .fit(df);
    DataFrame indexed = indexer.transform(df);

    // Fit the custom encoder under test.
    CustomOneHotEncoderModel sparkModel = new CustomOneHotEncoder()
            .setInputCol("categoryIndex")
            .setOutputCol("categoryVec")
            .fit(indexed);

    // Round-trip the custom encoder through export/import.
    byte[] exportedModel = ModelExporter.export(sparkModel, indexed);
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    // Spark's own OneHotEncoder serves as the reference implementation.
    OneHotEncoder sparkOneHotModel = new OneHotEncoder()
            .setInputCol("categoryIndex")
            .setOutputCol("categoryVec");

    Row[] sparkOutput = sparkModel.transform(indexed)
            .orderBy("id").select("id", "categoryIndex", "categoryVec").collect();
    Row[] sparkOneHotOutput = sparkOneHotModel.transform(indexed)
            .orderBy("id").select("id", "categoryIndex", "categoryVec").collect();

    // Both encoders must produce the same number of rows.
    assertEquals(sparkOutput.length, sparkOneHotOutput.length);

    // Imported transformer, custom encoder, and Spark's encoder must agree.
    for (int i = 0; i < sparkOutput.length; i++) {
        Row row = sparkOutput[i];
        Map<String, Object> data = new HashMap<String, Object>();
        data.put(sparkModel.getInputCol(), row.getDouble(1));
        transformer.transform(data);
        double[] transformedOp = (double[]) data.get(sparkModel.getOutputCol());
        double[] sparkOp = ((Vector) row.get(2)).toArray();
        double[] sparkOneHotOp = ((Vector) sparkOneHotOutput[i].get(2)).toArray();
        assertArrayEquals(transformedOp, sparkOp, EPSILON);
        assertArrayEquals(sparkOneHotOp, sparkOp, EPSILON);
    }
}
 
Example 16
@Test
public void testStringIndexerForHandlingUnseenValues() {

    // Train on numeric labels {1.0, 2.0, 3.0}; the indexer keys on their
    // string form.
    StructType schema = createStructType(new StructField[]{
            createStructField("id", IntegerType, false),
            createStructField("label", DoubleType, false)
    });
    DataFrame dataset = sqlContext.createDataFrame(
            Arrays.asList(cr(0, 1.0), cr(1, 2.0), cr(2, 3.0), cr(3, 1.0), cr(4, 1.0), cr(5, 3.0)),
            schema);

    StringIndexerModel model = new StringIndexer()
            .setInputCol("label")
            .setOutputCol("labelIndex")
            .fit(dataset);

    // Export, then disable fail-on-unseen before building the transformer.
    byte[] exportedModel = ModelExporter.export(model, dataset);
    StringIndexerModelInfo stringIndexerModelInfo =
            (StringIndexerModelInfo) ModelImporter.importModelInfo(exportedModel);
    stringIndexerModelInfo.setFailOnUnseenValues(false);
    Transformer transformer = stringIndexerModelInfo.getTransformer();

    Map<String, Object> data = new HashMap<String, Object>();

    // Every unseen value maps to index 3 (one past the three seen labels).
    for (double unseen : new double[]{7.0, 9.0, 0.0}) {
        data.put(model.getInputCol(), unseen);
        transformer.transform(data);
        double indexerOutput = (double) data.get(model.getOutputCol());
        assertEquals(indexerOutput, 3.0, EPSILON);
    }

    // A seen value still maps to its trained index.
    data.put(model.getInputCol(), 2.0);
    transformer.transform(data);
    double indexerOutput = (double) data.get(model.getOutputCol());
    assertEquals(indexerOutput, stringIndexerModelInfo.getLabelToIndex().get("2.0"), EPSILON);
}
 
Example 17
@Test
public void testRandomForestClassification() {
    // Load the data stored in LIBSVM format as a DataFrame.
    DataFrame data = sqlContext.read().format("libsvm").load("src/test/resources/classification_test.libsvm");

    // Index the label column before splitting.
    StringIndexerModel stringIndexerModel = new StringIndexer()
            .setInputCol("label")
            .setOutputCol("labelIndex")
            .fit(data);
    data = stringIndexerModel.transform(data);

    // Split the data into training and test sets (30% held out for testing).
    DataFrame[] splits = data.randomSplit(new double[]{0.7, 0.3});
    DataFrame trainingData = splits[0];
    DataFrame testData = splits[1];

    // Train a RandomForest model with explicit output column names.
    RandomForestClassificationModel classificationModel = new RandomForestClassifier()
            .setLabelCol("labelIndex")
            .setFeaturesCol("features")
            .setPredictionCol("prediction")
            .setRawPredictionCol("rawPrediction")
            .setProbabilityCol("probability")
            .fit(trainingData);

    // Round-trip the fitted model through export/import.
    byte[] exportedModel = ModelExporter.export(classificationModel, null);
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    Row[] sparkOutput = classificationModel.transform(testData)
            .select("features", "prediction", "rawPrediction", "probability").collect();

    // Prediction, probability and raw scores must all match Spark's output.
    for (Row row : sparkOutput) {
        Vector features = (Vector) row.get(0);
        double actual = row.getDouble(1);
        double[] actualRaw = ((Vector) row.get(2)).toArray();
        double[] actualProbability = ((Vector) row.get(3)).toArray();

        Map<String, Object> inputData = new HashMap<String, Object>();
        inputData.put(transformer.getInputKeys().iterator().next(), features.toArray());
        transformer.transform(inputData);

        assertEquals(actual, (double) inputData.get("prediction"), EPSILON);
        assertArrayEquals(actualProbability, (double[]) inputData.get("probability"), EPSILON);
        assertArrayEquals(actualRaw, (double[]) inputData.get("rawPrediction"), EPSILON);
    }
}
 
Example 18
@Test
public void testRandomForestClassificationWithPipeline() {
    // Load the data stored in LIBSVM format as a DataFrame.
    DataFrame data = sqlContext.read().format("libsvm").load("src/test/resources/classification_test.libsvm");

    // Split the data into training and test sets (30% held out for testing).
    DataFrame[] splits = data.randomSplit(new double[]{0.7, 0.3});
    DataFrame trainingData = splits[0];
    DataFrame testData = splits[1];

    // Index the label column into numeric form.
    StringIndexer indexer = new StringIndexer()
            .setInputCol("label")
            .setOutputCol("labelIndex");

    // Configure a RandomForest with explicit output column names.
    RandomForestClassifier classifier = new RandomForestClassifier()
            .setLabelCol("labelIndex")
            .setFeaturesCol("features")
            .setPredictionCol("prediction")
            .setRawPredictionCol("rawPrediction")
            .setProbabilityCol("probability");

    // Fitting the pipeline also runs the indexer.
    PipelineModel sparkPipeline = new Pipeline()
            .setStages(new PipelineStage[]{indexer, classifier})
            .fit(trainingData);

    // Round-trip the fitted pipeline through export/import.
    byte[] exportedModel = ModelExporter.export(sparkPipeline, null);
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    Row[] sparkOutput = sparkPipeline.transform(testData)
            .select("label", "features", "prediction", "rawPrediction", "probability").collect();

    // Prediction, probability and raw scores must all match Spark's output.
    for (Row row : sparkOutput) {
        Vector features = (Vector) row.get(1);
        double actual = row.getDouble(2);
        double[] actualRaw = ((Vector) row.get(3)).toArray();
        double[] actualProbability = ((Vector) row.get(4)).toArray();

        Map<String, Object> inputData = new HashMap<String, Object>();
        inputData.put("features", features.toArray());
        inputData.put("label", row.get(0).toString());
        transformer.transform(inputData);

        assertEquals(actual, (double) inputData.get("prediction"), EPSILON);
        assertArrayEquals(actualProbability, (double[]) inputData.get("probability"), EPSILON);
        assertArrayEquals(actualRaw, (double[]) inputData.get("rawPrediction"), EPSILON);
    }
}
 
Example 19
@Test
public void testDecisionTreeClassificationRawPrediction() {
    // Load the data stored in LIBSVM format as a DataFrame.
    DataFrame data = sqlContext.read().format("libsvm").load("src/test/resources/classification_test.libsvm");

    // Index the label column before splitting.
    StringIndexerModel stringIndexerModel = new StringIndexer()
            .setInputCol("label")
            .setOutputCol("labelIndex")
            .fit(data);
    data = stringIndexerModel.transform(data);

    // Split the data into training and test sets (30% held out for testing).
    DataFrame[] splits = data.randomSplit(new double[]{0.7, 0.3});
    DataFrame trainingData = splits[0];
    DataFrame testData = splits[1];

    // Train a DecisionTree model exposing both prediction columns.
    DecisionTreeClassificationModel classificationModel = new DecisionTreeClassifier()
            .setLabelCol("labelIndex")
            .setFeaturesCol("features")
            .setRawPredictionCol("rawPrediction")
            .setPredictionCol("prediction")
            .fit(trainingData);

    // Round-trip through export/import; the cast asserts the imported
    // transformer is the decision-tree flavor.
    byte[] exportedModel = ModelExporter.export(classificationModel, null);
    Transformer transformer = (DecisionTreeTransformer) ModelImporter.importAndGetTransformer(exportedModel);

    Row[] sparkOutput = classificationModel.transform(testData)
            .select("features", "prediction", "rawPrediction").collect();

    // Prediction and raw scores must match Spark's output.
    for (Row row : sparkOutput) {
        Vector features = (Vector) row.get(0);
        double actual = row.getDouble(1);
        double[] actualRaw = ((Vector) row.get(2)).toArray();

        Map<String, Object> inputData = new HashMap<>();
        inputData.put(transformer.getInputKeys().iterator().next(), features.toArray());
        transformer.transform(inputData);
        double predicted = (double) inputData.get(transformer.getOutputKeys().iterator().next());

        assertEquals(actual, predicted, EPSILON);
        assertArrayEquals(actualRaw, (double[]) inputData.get("rawPrediction"), EPSILON);
    }
}
 
Example 20
@Test
public void testDecisionTreeClassificationWithPipeline() {
    // Load the data stored in LIBSVM format as a DataFrame.
    DataFrame data = sqlContext.read().format("libsvm").load("src/test/resources/classification_test.libsvm");

    // Split the data into training and test sets (30% held out for testing).
    DataFrame[] splits = data.randomSplit(new double[]{0.7, 0.3});
    DataFrame trainingData = splits[0];
    DataFrame testData = splits[1];

    // Index the label column into numeric form.
    StringIndexer indexer = new StringIndexer()
            .setInputCol("label")
            .setOutputCol("labelIndex");

    // Train a DecisionTree model on the indexed label.
    DecisionTreeClassifier classificationModel = new DecisionTreeClassifier()
            .setLabelCol("labelIndex")
            .setFeaturesCol("features");

    // Fitting the pipeline also runs the indexer.
    PipelineModel sparkPipeline = new Pipeline()
            .setStages(new PipelineStage[]{indexer, classificationModel})
            .fit(trainingData);

    // Round-trip the fitted pipeline through export/import.
    byte[] exportedModel = ModelExporter.export(sparkPipeline, null);
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    Row[] sparkOutput = sparkPipeline.transform(testData)
            .select("label", "features", "prediction").collect();

    // The imported transformer must reproduce Spark's predictions.
    for (Row row : sparkOutput) {
        Vector features = (Vector) row.get(1);
        double actual = row.getDouble(2);

        Map<String, Object> inputData = new HashMap<String, Object>();
        inputData.put("features", features.toArray());
        inputData.put("label", row.get(0).toString());
        transformer.transform(inputData);
        double predicted = (double) inputData.get("prediction");

        assertEquals(actual, predicted, EPSILON);
    }
}