org.apache.spark.ml.feature.IndexToString Java Examples

The following examples show how to use org.apache.spark.ml.feature.IndexToString. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example #1

Source File: IndexToStringConverter.java From jpmml-sparkml with GNU Affero General Public License v3.0

5 votes

/**
 * Declares a categorical string data field for the wrapped {@link IndexToString}
 * transformer, using the transformer's labels as the field's values, and exposes
 * it as a single categorical feature.
 *
 * @param encoder the encoder used to create the data field and to back the feature
 * @return a one-element list holding the categorical feature for the new data field
 */
@Override
public List<Feature> encodeFeatures(SparkMLEncoder encoder){
	IndexToString indexToString = getTransformer();

	// The transformer's label array becomes the list of values of the data field.
	List<String> labelValues = Arrays.asList(indexToString.getLabels());

	DataField field = encoder.createDataField(
		formatName(indexToString),
		OpType.CATEGORICAL,
		DataType.STRING,
		labelValues
	);

	Feature categoricalFeature = new CategoricalFeature(encoder, field);

	return Collections.singletonList(categoricalFeature);
}

Example #2

Source File: SparkMultiClassClassifier.java From mmtf-spark with Apache License 2.0

4 votes

/**
 * Trains the configured predictor on a random subset of the data and evaluates it
 * on the held-out remainder.
 *
 * Dataset must at least contain the following two columns:
 * the column named by the {@code label} field: the class labels
 * features: feature vector
 *
 * Side effects: prints the per-class train/test counts and a sample of the
 * predictions to standard output.
 *
 * @param data dataset with the label and feature columns
 * @return map with metrics (method name, AUC for two-class problems, weighted
 *         F-measure, accuracy, precision, recall, false/true positive rate and
 *         the confusion matrix), in insertion order
 */
public Map<String,String> fit(Dataset<Row> data) {
	int classCount = (int)data.select(label).distinct().count();

	// The indexer is fitted on the full dataset so that every label is known to it.
	StringIndexerModel labelIndexer = new StringIndexer()
	  .setInputCol(label)
	  .setOutputCol("indexedLabel")
	  .fit(data);

	// Split the data into training and test sets (a fraction of testFraction is held out for testing)
	Dataset<Row>[] splits = data.randomSplit(new double[] {1.0-testFraction, testFraction}, seed);
	Dataset<Row> trainingData = splits[0];
	Dataset<Row> testData = splits[1];

	String[] labels = labelIndexer.labels();

	// Report the class distribution of both splits. The comparison is expressed as a
	// Column predicate rather than a concatenated SQL string, so that label values
	// containing quotes or other SQL syntax cannot break or alter the filter.
	Dataset<Row> trainingLabels = trainingData.select(label);
	Dataset<Row> testLabels = testData.select(label);

	System.out.println();
	System.out.println("Class\tTrain\tTest");
	for (String l: labels) {
		long trainCount = trainingLabels.filter(trainingLabels.col(label).equalTo(l)).count();
		long testCount = testLabels.filter(testLabels.col(label).equalTo(l)).count();
		System.out.println(l + "\t" + trainCount + "\t" + testCount);
	}

	// Set input columns
	predictor
	.setLabelCol("indexedLabel")
	.setFeaturesCol("features");

	// Convert indexed labels back to original labels.
	IndexToString labelConverter = new IndexToString()
	  .setInputCol("prediction")
	  .setOutputCol("predictedLabel")
	  .setLabels(labels);

	// Chain indexers and forest in a Pipeline
	Pipeline pipeline = new Pipeline()
	  .setStages(new PipelineStage[] {labelIndexer, predictor, labelConverter});

	// Train model. This also runs the indexers.
	PipelineModel model = pipeline.fit(trainingData);

	// Make predictions. A reference to the cached dataset is kept so that it can be
	// released afterwards; the renamed datasets derived below are different plans.
	Dataset<Row> cachedPredictions = model.transform(testData).cache();
	try {
		// Display some sample predictions
		System.out.println();
		System.out.println("Sample predictions: " + predictor.getClass().getSimpleName());

		cachedPredictions.sample(false, 0.1, seed).show(25);

		// Move the original string labels out of the way and expose the numeric
		// indexed labels under the label column name for the metrics classes.
		Dataset<Row> predictions = cachedPredictions.withColumnRenamed(label, "stringLabel");
		predictions = predictions.withColumnRenamed("indexedLabel", label);

		// collect metrics
		Dataset<Row> pred = predictions.select("prediction",label);
		Map<String,String> metrics = new LinkedHashMap<>();
		metrics.put("Method", predictor.getClass().getSimpleName());

		if (classCount == 2) {
			BinaryClassificationMetrics b = new BinaryClassificationMetrics(pred);
			metrics.put("AUC", Float.toString((float)b.areaUnderROC()));
		}

		MulticlassMetrics m = new MulticlassMetrics(pred);
		metrics.put("F", Float.toString((float)m.weightedFMeasure()));
		metrics.put("Accuracy", Float.toString((float)m.accuracy()));
		metrics.put("Precision", Float.toString((float)m.weightedPrecision()));
		metrics.put("Recall", Float.toString((float)m.weightedRecall()));
		metrics.put("False Positive Rate", Float.toString((float)m.weightedFalsePositiveRate()));
		metrics.put("True Positive Rate", Float.toString((float)m.weightedTruePositiveRate()));
		metrics.put("", "\nConfusion Matrix\n"
			+ Arrays.toString(labels) +"\n"
			+ m.confusionMatrix().toString());

		return metrics;
	} finally {
		// All metrics have been materialized as strings at this point, so the
		// cached predictions are no longer needed.
		cachedPredictions.unpersist();
	}
}

Example #3

Source File: JavaIndexToStringExample.java From SparkDemo with MIT License

4 votes

/**
 * Runs the IndexToString example: indexes a string column with a StringIndexer,
 * prints the indexed data and the output column's attribute metadata, then maps
 * the indices back to strings with IndexToString and prints the result.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
  SparkSession session = SparkSession.builder()
    .appName("JavaIndexToStringExample")
    .getOrCreate();

  // $example on$
  // Schema: a non-nullable integer id and a non-nullable string category.
  StructField idField =
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty());
  StructField categoryField =
    new StructField("category", DataTypes.StringType, false, Metadata.empty());
  StructType schema = new StructType(new StructField[]{idField, categoryField});

  List<Row> rows = Arrays.asList(
    RowFactory.create(0, "a"),
    RowFactory.create(1, "b"),
    RowFactory.create(2, "c"),
    RowFactory.create(3, "a"),
    RowFactory.create(4, "a"),
    RowFactory.create(5, "c")
  );
  Dataset<Row> input = session.createDataFrame(rows, schema);

  // Fit the indexer on the category column and add the index column.
  StringIndexer stringIndexer = new StringIndexer()
    .setInputCol("category")
    .setOutputCol("categoryIndex");
  StringIndexerModel indexerModel = stringIndexer.fit(input);
  Dataset<Row> withIndex = indexerModel.transform(input);

  String indexedMessage =
    "Transformed string column '" + indexerModel.getInputCol() + "' " +
      "to indexed column '" + indexerModel.getOutputCol() + "'";
  System.out.println(indexedMessage);
  withIndex.show();

  // Print the attribute metadata attached to the index column.
  StructField indexColumnField = withIndex.schema().apply(indexerModel.getOutputCol());
  String metadataMessage =
    "StringIndexer will store labels in output column metadata: " +
      Attribute.fromStructField(indexColumnField).toString() + "\n";
  System.out.println(metadataMessage);

  // No labels are set on the converter here.
  IndexToString indexToString = new IndexToString()
    .setInputCol("categoryIndex")
    .setOutputCol("originalCategory");
  Dataset<Row> restored = indexToString.transform(withIndex);

  String restoredMessage =
    "Transformed indexed column '" + indexToString.getInputCol() + "' back to " +
      "original string column '" + indexToString.getOutputCol() + "' using labels in metadata";
  System.out.println(restoredMessage);
  restored.select("id", "categoryIndex", "originalCategory").show();

  // $example off$
  session.stop();
}

Example #4

Source File: IndexToStringConverter.java From jpmml-sparkml with GNU Affero General Public License v3.0

4 votes

/**
 * Creates a converter for the given {@link IndexToString} transformer by
 * passing it to the superclass constructor.
 *
 * @param transformer the transformer this converter operates on
 */
public IndexToStringConverter(IndexToString transformer){
	// No state of its own: construction is fully delegated to the superclass.
	super(transformer);
}