org.apache.spark.ml.classification.RandomForestClassificationModel Java Examples

The following examples show how to use org.apache.spark.ml.classification.RandomForestClassificationModel. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also check out related API usage in the sidebar.
Example #1
Source File: JavaRandomForestClassifierExample.java    From SparkDemo with MIT License 6 votes vote down vote up
/**
 * Trains a random forest classifier on the sample LIBSVM data through an ML pipeline,
 * then prints a few predictions, the test error and the learned forest.
 */
public static void main(String[] args) {
  SparkSession session = SparkSession.builder()
    .appName("JavaRandomForestClassifierExample")
    .getOrCreate();

  // $example on$
  // Read the LIBSVM file into a DataFrame.
  Dataset<Row> dataset = session.read()
    .format("libsvm")
    .load("data/mllib/sample_libsvm_data.txt");

  // Label indexer: fitted on the full dataset so that every label receives an index.
  StringIndexerModel labelIndexerModel = new StringIndexer()
    .setInputCol("label")
    .setOutputCol("indexedLabel")
    .fit(dataset);

  // Feature indexer: features with more than 4 distinct values stay continuous,
  // the remaining ones are treated as categorical and indexed.
  VectorIndexerModel featureIndexerModel = new VectorIndexer()
    .setInputCol("features")
    .setOutputCol("indexedFeatures")
    .setMaxCategories(4)
    .fit(dataset);

  // 70% of the rows go to training, the other 30% are held out for testing.
  double[] splitWeights = {0.7, 0.3};
  Dataset<Row>[] parts = dataset.randomSplit(splitWeights);
  Dataset<Row> trainSet = parts[0];
  Dataset<Row> testSet = parts[1];

  // The random forest estimator works on the indexed label and feature columns.
  RandomForestClassifier forest = new RandomForestClassifier()
    .setLabelCol("indexedLabel")
    .setFeaturesCol("indexedFeatures");

  // Maps predicted indices back to the original label values.
  IndexToString indexToLabel = new IndexToString()
    .setInputCol("prediction")
    .setOutputCol("predictedLabel")
    .setLabels(labelIndexerModel.labels());

  // Assemble indexers, forest and label converter into a single pipeline.
  PipelineStage[] stages = {labelIndexerModel, featureIndexerModel, forest, indexToLabel};
  Pipeline mlPipeline = new Pipeline().setStages(stages);

  // Fitting the pipeline runs the indexers and trains the forest.
  PipelineModel fittedPipeline = mlPipeline.fit(trainSet);

  // Score the held-out rows.
  Dataset<Row> scored = fittedPipeline.transform(testSet);

  // Show a handful of example rows.
  scored.select("predictedLabel", "label", "features").show(5);

  // Evaluate accuracy of prediction vs. indexed label, and report it as test error.
  MulticlassClassificationEvaluator accuracyEvaluator = new MulticlassClassificationEvaluator()
    .setLabelCol("indexedLabel")
    .setPredictionCol("prediction")
    .setMetricName("accuracy");
  double accuracy = accuracyEvaluator.evaluate(scored);
  double testError = 1.0 - accuracy;
  System.out.println("Test Error = " + testError);

  // The forest is the third stage (index 2) of the fitted pipeline.
  RandomForestClassificationModel trainedForest =
    (RandomForestClassificationModel) fittedPipeline.stages()[2];
  System.out.println("Learned classification forest model:\n" + trainedForest.toDebugString());
  // $example off$

  session.stop();
}
 
Example #2
Source File: EntitySalienceSpark.java    From ambiverse-nlu with Apache License 2.0 5 votes vote down vote up
/**
 * Extracts the entity instances of the given CAS, scores each one with the random
 * forest stage of {@code trainingModel} and adds a {@code SalientEntity} annotation
 * per instance to the CAS indexes.
 *
 * @param jCas the CAS to read entity instances from and to annotate
 * @throws AnalysisEngineProcessException wrapping any exception raised during
 *         extraction, prediction or annotation
 */
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
    final long startedAt = System.currentTimeMillis();
    final FeatureExtractor extractor = new NYTEntitySalienceFeatureExtractor();

    try {
        final List<EntityInstance> instances =
                extractor.getEntityInstances(jCas, TrainingSettings.FeatureExtractor.ENTITY_SALIENCE);

        final int vectorSize = FeatureSetFactory
                .createFeatureSet(TrainingSettings.FeatureExtractor.ENTITY_SALIENCE)
                .getFeatureVectorSize();

        //TODO: For each model create separate implementation.
        // The random forest is read from the third stage (index 2) of the training model.
        final RandomForestClassificationModel forest =
                (RandomForestClassificationModel) trainingModel.stages()[2];

        for (final EntityInstance instance : instances) {
            final Vector features = FeatureValueInstanceUtils.convertToSparkMLVector(instance, vectorSize);

            final double predictedLabel = forest.predict(features);
            final Vector classProbabilities = forest.predictProbability(features);
            // The probability at index 1 is stored as the salience score.
            final double salienceScore = classProbabilities.toArray()[1];

            final SalientEntity annotation = new SalientEntity(jCas, 0, 0);
            annotation.setLabel(predictedLabel);
            annotation.setID(instance.getEntityId());
            annotation.setSalience(salienceScore);
            annotation.addToIndexes();
        }

        final long elapsedMillis = System.currentTimeMillis() - startedAt;
        logger.debug("Annotating salient entities finished in {}ms.", elapsedMillis);
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    }
}
 
Example #3
Source File: RandomForestClassificationModelInfoAdapter.java    From spark-transformers with Apache License 2.0 5 votes vote down vote up
/**
 * Copies the parameters of a Spark random forest classification model into a
 * {@link RandomForestModelInfo}: class and feature counts, tree weights, the
 * per-tree model info, and the input/output column keys.
 *
 * @param sparkRfModel the trained Spark model to read from
 * @param df           data frame handed through to the decision tree adapter
 * @return the populated model info
 */
@Override
RandomForestModelInfo getModelInfo(final RandomForestClassificationModel sparkRfModel, final DataFrame df) {
    final RandomForestModelInfo info = new RandomForestModelInfo();

    info.setNumClasses(sparkRfModel.numClasses());
    info.setNumFeatures(sparkRfModel.numFeatures());
    // Classification model, hence not a regression.
    info.setRegression(false);

    // Box the primitive tree weights one at a time.
    final double[] rawWeights = sparkRfModel.treeWeights();
    final List<Double> boxedWeights = new ArrayList<>();
    for (int i = 0; i < rawWeights.length; i++) {
        boxedWeights.add(rawWeights[i]);
    }
    info.setTreeWeights(boxedWeights);

    // Convert every tree of the forest through the decision tree adapter.
    final List<DecisionTreeModelInfo> treeInfos = new ArrayList<>();
    for (final DecisionTreeModel tree : sparkRfModel.trees()) {
        final DecisionTreeClassificationModel classificationTree = (DecisionTreeClassificationModel) tree;
        treeInfos.add(DECISION_TREE_ADAPTER.getModelInfo(classificationTree, df));
    }
    info.setTrees(treeInfos);

    // Input keys: features column first, then label column (insertion order is kept).
    final Set<String> inputs = new LinkedHashSet<>();
    inputs.add(sparkRfModel.getFeaturesCol());
    inputs.add(sparkRfModel.getLabelCol());
    info.setInputKeys(inputs);

    // Output keys: prediction, probability and raw prediction columns, in that order.
    final Set<String> outputs = new LinkedHashSet<>();
    outputs.add(sparkRfModel.getPredictionCol());
    outputs.add(sparkRfModel.getProbabilityCol());
    outputs.add(sparkRfModel.getRawPredictionCol());
    info.setProbabilityKey(sparkRfModel.getProbabilityCol());
    info.setRawPredictionKey(sparkRfModel.getRawPredictionCol());
    info.setOutputKeys(outputs);

    return info;
}
 
Example #4
Source File: RandomForestClassificationModelConverter.java    From jpmml-sparkml with GNU Affero General Public License v3.0 4 votes vote down vote up
/**
 * Creates a converter for the given random forest classification model.
 * The model is simply handed to the superclass constructor.
 *
 * @param model the Spark ML random forest classification model to convert
 */
public RandomForestClassificationModelConverter(RandomForestClassificationModel model){
	super(model);
}
 
Example #5
Source File: RandomForestClassificationModelInfoAdapter.java    From spark-transformers with Apache License 2.0 4 votes vote down vote up
/**
 * Returns the source model type of this adapter.
 *
 * @return the {@link RandomForestClassificationModel} class object
 */
@Override
public Class<RandomForestClassificationModel> getSource() {
    return RandomForestClassificationModel.class;
}
 
Example #6
Source File: RandomForestClassificationModelInfoAdapterBridgeTest.java    From spark-transformers with Apache License 2.0 4 votes vote down vote up
/**
 * Trains a Spark random forest classifier, exports it, imports it back as a
 * {@code Transformer}, and checks that the imported transformer yields the same
 * prediction, probability and raw prediction as Spark for every test row.
 */
@Test
public void testRandomForestClassification() {
    // Read the LIBSVM-formatted fixture into a DataFrame.
    final DataFrame rawData = sqlContext.read()
            .format("libsvm")
            .load("src/test/resources/classification_test.libsvm");

    // Index the labels into the "labelIndex" column.
    final StringIndexerModel labelIndexer = new StringIndexer()
            .setInputCol("label")
            .setOutputCol("labelIndex")
            .fit(rawData);
    final DataFrame indexedData = labelIndexer.transform(rawData);

    // 70% training rows, 30% held out for testing.
    final double[] splitWeights = {0.7, 0.3};
    final DataFrame[] parts = indexedData.randomSplit(splitWeights);
    final DataFrame trainSet = parts[0];
    final DataFrame testSet = parts[1];

    // Fit the random forest on the training rows.
    final RandomForestClassifier classifier = new RandomForestClassifier()
            .setLabelCol("labelIndex")
            .setFeaturesCol("features")
            .setPredictionCol("prediction")
            .setRawPredictionCol("rawPrediction")
            .setProbabilityCol("probability");
    final RandomForestClassificationModel sparkModel = classifier.fit(trainSet);

    // Round-trip the model through the exporter and importer.
    final byte[] serializedModel = ModelExporter.export(sparkModel, null);
    final Transformer importedTransformer = ModelImporter.importAndGetTransformer(serializedModel);

    // Reference output computed by Spark itself.
    final Row[] expectedRows = sparkModel.transform(testSet)
            .select("features", "prediction", "rawPrediction", "probability")
            .collect();

    // Compare the imported transformer against Spark, row by row.
    for (int i = 0; i < expectedRows.length; i++) {
        final Row expectedRow = expectedRows[i];
        final Vector features = (Vector) expectedRow.get(0);
        final double sparkPrediction = expectedRow.getDouble(1);
        final double[] sparkProbability = ((Vector) expectedRow.get(3)).toArray();
        final double[] sparkRawPrediction = ((Vector) expectedRow.get(2)).toArray();

        // The transformer reads its input from, and writes its output into, this map.
        final Map<String, Object> record = new HashMap<>();
        final String inputKey = importedTransformer.getInputKeys().iterator().next();
        record.put(inputKey, features.toArray());
        importedTransformer.transform(record);

        final double importedPrediction = (double) record.get("prediction");
        final double[] importedProbability = (double[]) record.get("probability");
        final double[] importedRawPrediction = (double[]) record.get("rawPrediction");

        assertEquals(sparkPrediction, importedPrediction, EPSILON);
        assertArrayEquals(sparkProbability, importedProbability, EPSILON);
        assertArrayEquals(sparkRawPrediction, importedRawPrediction, EPSILON);
    }
}