Example #1
Source File:    From SparkDemo with MIT License 6 votes vote down vote up
/**
 * Example driver for a random-forest classification pipeline: a label indexer,
 * a feature indexer, a random forest and a label converter are chained in a
 * Pipeline, predictions are made on the held-out split, and the test error and
 * the learned forest are printed.
 *
 * NOTE(review): several statements in this listing stop mid-expression (for
 * example after {@code SparkSession} and {@code new StringIndexer()}, and the
 * right-hand side of {@code model}). They appear to have been truncated and
 * must be restored from the original example before this compiles.
 *
 * @param args command-line arguments; not read by the code shown here
 */
public static void main(String[] args) {
  // NOTE(review): the session builder chain is missing here — TODO restore.
  SparkSession spark = SparkSession

  // $example on$
  // Load and parse the data file, converting it to a DataFrame.
  // NOTE(review): the reader call in front of the format name is missing — TODO restore.
  Dataset<Row> data ="libsvm").load("data/mllib/sample_libsvm_data.txt");

  // Index labels, adding metadata to the label column.
  // Fit on whole dataset to include all labels in index.
  // NOTE(review): the setter/fit chain on the indexer is missing — TODO restore.
  StringIndexerModel labelIndexer = new StringIndexer()
  // Automatically identify categorical features, and index them.
  // Set maxCategories so features with > 4 distinct values are treated as continuous.
  // NOTE(review): the configuration chain (including maxCategories) is missing — TODO restore.
  VectorIndexerModel featureIndexer = new VectorIndexer()

  // Split the data into training and test sets (30% held out for testing).
  // No seed is passed to randomSplit here.
  Dataset<Row>[] splits = data.randomSplit(new double[] {0.7, 0.3});
  Dataset<Row> trainingData = splits[0];
  Dataset<Row> testData = splits[1];

  // Set up the RandomForest classifier; it is trained later as a pipeline stage.
  // NOTE(review): the column/parameter setters are missing — TODO restore.
  RandomForestClassifier rf = new RandomForestClassifier()

  // Convert indexed labels back to original labels.
  // NOTE(review): the column/label setters are missing — TODO restore.
  IndexToString labelConverter = new IndexToString()

  // Chain indexers and forest in a Pipeline.
  // Stage order matters further down: rf is the third stage (index 2).
  Pipeline pipeline = new Pipeline()
    .setStages(new PipelineStage[] {labelIndexer, featureIndexer, rf, labelConverter});

  // Train model. This also runs the indexers.
  // NOTE(review): the right-hand side is missing; presumably the pipeline is
  // fitted on trainingData — confirm.
  PipelineModel model =;

  // Make predictions.
  Dataset<Row> predictions = model.transform(testData);

  // NOTE(review): the next line is a comment fused with the tail of a statement
  // that apparently selected three columns and showed 5 rows; the start of that
  // statement is missing — TODO restore.
  // Select example rows to display."predictedLabel", "label", "features").show(5);

  // Select (prediction, true label) and compute test error
  // NOTE(review): the evaluator configuration is missing, so the metric returned
  // by evaluate() cannot be confirmed from here; the variable name suggests accuracy.
  MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
  double accuracy = evaluator.evaluate(predictions);
  System.out.println("Test Error = " + (1.0 - accuracy));

  // Fetch the fitted forest from the pipeline model: index 2 matches rf's
  // position in the setStages array above.
  RandomForestClassificationModel rfModel = (RandomForestClassificationModel)(model.stages()[2]);
  System.out.println("Learned classification forest model:\n" + rfModel.toDebugString());
  // $example off$

Example #2
Source File:    From mmtf-spark with Apache License 2.0 4 votes vote down vote up
/**
 * @param args args[0] path to parquet file, args[1] name of classification column
 * @throws IOException
 * @throws StructureException
 */
public static void main(String[] args) throws IOException {
	// Purpose: reads a dataset, reports feature/class counts and dataset sizes,
	// balances it with DatasetBalancer.downsample, then wraps four classifiers
	// (decision tree, random forest, logistic regression, multilayer perceptron)
	// in SparkMultiClassClassifier using the same label column, test fraction and
	// seed, and finally prints the elapsed wall-clock time.
	// NOTE(review): several statements in this listing stop mid-expression and
	// closing braces are missing; they appear to have been truncated and must be
	// restored from the original project before this compiles.

	// Exactly two arguments are expected: <parquet file> <classification column name>.
	// NOTE(review): whatever followed the usage message (exit/return and the
	// closing brace) is missing — TODO restore.
	if (args.length != 2) {
		System.err.println("Usage: " + DatasetClassifier.class.getSimpleName() + " <parquet file> <classification column name>");

	// name of the class label
	String label = args[1];
	// Start of the timed region; compared against `end` at the bottom.
	long start = System.nanoTime();

	// NOTE(review): the session builder chain is missing — TODO restore.
	SparkSession spark = SparkSession

	// NOTE(review): the reader call is missing; only the args[0] index and the
	// trailing cache() survive — TODO restore.
	Dataset<Row> data =[0]).cache();
	// Derive the feature count from the "features" column of the first row.
	// NOTE(review): numActives() is used for both vector kinds. For a
	// SparseVector that is presumably the number of explicitly stored entries,
	// not the vector dimension, and featureCount later sizes the network's input
	// layer — check whether size() is what is wanted here.
	int featureCount = 0;
	Object vector = data.first().getAs("features");
	if (vector instanceof DenseVector) {
	   featureCount = ((DenseVector)vector).numActives();
	} else if (vector instanceof SparseVector) {
	   featureCount = ((SparseVector)vector).numActives();
	// NOTE(review): the closing brace of the else-if branch is missing — TODO restore.
	System.out.println("Feature count            : "  + featureCount);
	// NOTE(review): the expression being cast to int is missing; presumably a
	// count of distinct label values — confirm.
	int classCount = (int);
	System.out.println("Class count              : " + classCount);

	System.out.println("Dataset size (unbalanced): " + data.count());

	// Rebalance the dataset by label. The meaning of the third argument (1) is
	// defined by DatasetBalancer.downsample, which is not visible here.
	data = DatasetBalancer.downsample(data, label, 1);
	System.out.println("Dataset size (balanced)  : " + data.count());

	// Settings shared by every classifier run below.
	double testFraction = 0.3;
	long seed = 123;

	SparkMultiClassClassifier mcc;
	Map<String, String> metrics;

	// NOTE(review): each `metrics =;` below has lost its right-hand side
	// (presumably a call on mcc that fits the model and returns the metrics) and,
	// it seems, any statement that printed the result — TODO restore.
	DecisionTreeClassifier dtc = new DecisionTreeClassifier();
	mcc = new SparkMultiClassClassifier(dtc, label, testFraction, seed);
	metrics =;

	RandomForestClassifier rfc = new RandomForestClassifier();
	mcc = new SparkMultiClassClassifier(rfc, label, testFraction, seed);
	metrics =;

	LogisticRegression lr = new LogisticRegression();
	mcc = new SparkMultiClassClassifier(lr, label, testFraction, seed);
	metrics =;

	// specify layers for the neural network
	//    input layer: dimension of feature vector
	//    hidden layer: 10 units
	//    output layer: number of classes
	int[] layers = new int[] {featureCount, 10, classCount};
	// NOTE(review): the setter chain (presumably passing `layers`) is missing — TODO restore.
	MultilayerPerceptronClassifier mpc = new MultilayerPerceptronClassifier()

	mcc = new SparkMultiClassClassifier(mpc, label, testFraction, seed);
	metrics =;

	long end = System.nanoTime();

	// Elapsed time, converted from nanoseconds to seconds.
	System.out.println((end-start)/1E9 + " sec");
Example #3
Source File:    From spark-transformers with Apache License 2.0 4 votes vote down vote up
/**
 * Checks that a random-forest classification model, after being exported with
 * ModelExporter and re-imported with ModelImporter, yields the same prediction,
 * probability and raw prediction as the Spark model for every test row (within
 * EPSILON).
 *
 * NOTE(review): several statements in this listing stop mid-expression and the
 * closing braces are missing; they appear to have been truncated and must be
 * restored from the original project before this compiles.
 */
public void testRandomForestClassification() {
    // Load the data stored in LIBSVM format as a DataFrame.
    // NOTE(review): the reader call in front of the format name is missing — TODO restore.
    DataFrame data ="libsvm").load("src/test/resources/classification_test.libsvm");

    // NOTE(review): the indexer's setter/fit chain is missing — TODO restore.
    StringIndexerModel stringIndexerModel = new StringIndexer()

    // Apply the label indexer to the whole dataset before splitting.
    data = stringIndexerModel.transform(data);

    // Split the data into training and test sets (30% held out for testing)
    DataFrame[] splits = data.randomSplit(new double[]{0.7, 0.3});
    DataFrame trainingData = splits[0];
    DataFrame testData = splits[1];

    // Train a RandomForest model.
    // NOTE(review): the setter/fit chain is missing; presumably fitted on
    // trainingData — confirm.
    RandomForestClassificationModel classificationModel = new RandomForestClassifier()

    // Round-trip the model through the exporter/importer under test.
    byte[] exportedModel = ModelExporter.export(classificationModel, null);

    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    // Column order fixes the row indices used in the loop:
    // 0 = features, 1 = prediction, 2 = rawPrediction, 3 = probability.
    Row[] sparkOutput = classificationModel.transform(testData).select("features", "prediction", "rawPrediction", "probability").collect();

    //compare predictions
    for (Row row : sparkOutput) {
        Vector v = (Vector) row.get(0);
        double actual = row.getDouble(1);
        double [] actualProbability = ((Vector) row.get(3)).toArray();
        double[] actualRaw = ((Vector) row.get(2)).toArray();

        // Feed the feature array to the imported transformer under its first input key.
        Map<String, Object> inputData = new HashMap<String, Object>();
        inputData.put(transformer.getInputKeys().iterator().next(), v.toArray());
        // NOTE(review): no call that runs the transformer on inputData is visible
        // between the put above and the reads below, so as shown the output keys
        // would be absent. Presumably a transformer.transform(inputData) step
        // was lost — confirm and restore.
        double predicted = (double) inputData.get("prediction");
        double[] probability = (double[]) inputData.get("probability");
        double[] rawPrediction = (double[]) inputData.get("rawPrediction");

        // Spark output and imported-transformer output must agree within EPSILON.
        assertEquals(actual, predicted, EPSILON);
        assertArrayEquals(actualProbability, probability, EPSILON);
        assertArrayEquals(actualRaw, rawPrediction, EPSILON);


Example #4
Source File:    From spark-transformers with Apache License 2.0 4 votes vote down vote up
/**
 * Checks that a two-stage pipeline (string indexer followed by a random-forest
 * classifier), after being exported with ModelExporter and re-imported with
 * ModelImporter, yields the same prediction, probability and raw prediction as
 * the Spark pipeline model for every test row (within EPSILON).
 *
 * NOTE(review): several statements in this listing stop mid-expression and the
 * closing braces are missing; they appear to have been truncated and must be
 * restored from the original project before this compiles.
 */
public void testRandomForestClassificationWithPipeline() {
    // Load the data stored in LIBSVM format as a DataFrame.
    // NOTE(review): the reader call in front of the format name is missing — TODO restore.
    DataFrame data ="libsvm").load("src/test/resources/classification_test.libsvm");

    // Split the data into training and test sets (30% held out for testing)
    DataFrame[] splits = data.randomSplit(new double[]{0.7, 0.3});
    DataFrame trainingData = splits[0];
    DataFrame testData = splits[1];

    // NOTE(review): the indexer's column setters are missing — TODO restore.
    StringIndexer indexer = new StringIndexer()

    // Set up a RandomForest classifier stage; it is trained as part of the pipeline.
    // NOTE(review): the classifier's setters are missing — TODO restore.
    RandomForestClassifier classifier = new RandomForestClassifier()

    // Stage order: indexer first, then the classifier.
    Pipeline pipeline = new Pipeline()
            .setStages(new PipelineStage[]{indexer, classifier});

    // Train model.  This also runs the indexer.
    // NOTE(review): the right-hand side is missing; presumably the pipeline is
    // fitted on trainingData — confirm.
    PipelineModel sparkPipeline =;

    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkPipeline, null);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    // Column order fixes the row indices used in the loop:
    // 0 = label, 1 = features, 2 = prediction, 3 = rawPrediction, 4 = probability.
    Row[] sparkOutput = sparkPipeline.transform(testData).select("label", "features", "prediction", "rawPrediction", "probability").collect();

    //compare predictions
    for (Row row : sparkOutput) {
        Vector v = (Vector) row.get(1);
        double actual = row.getDouble(2);
        double [] actualProbability = ((Vector) row.get(4)).toArray();
        double[] actualRaw = ((Vector) row.get(3)).toArray();

        // Inputs for the imported pipeline: the feature array and the label as a string.
        Map<String, Object> inputData = new HashMap<String, Object>();
        inputData.put("features", v.toArray());
        inputData.put("label", row.get(0).toString());
        // NOTE(review): no call that runs the transformer on inputData is visible
        // between the puts above and the reads below, so as shown the output keys
        // would be absent. Presumably a transformer.transform(inputData) step
        // was lost — confirm and restore.
        double predicted = (double) inputData.get("prediction");
        double[] probability = (double[]) inputData.get("probability");
        double[] rawPrediction = (double[]) inputData.get("rawPrediction");

        // Spark output and imported-transformer output must agree within EPSILON.
        assertEquals(actual, predicted, EPSILON);
        assertArrayEquals(actualProbability, probability, EPSILON);
        assertArrayEquals(actualRaw, rawPrediction, EPSILON);