Java Code Examples for org.apache.spark.sql.Row#getDouble()

The following examples show how to use org.apache.spark.sql.Row#getDouble(), which returns the value at a given column ordinal as a primitive double. The examples are taken from open-source projects; the source file, project, and license are noted above each one.
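Before the examples, a minimal sketch of the accessor's contract (the row contents here are hypothetical): getDouble(i) returns the value at ordinal i as a primitive double, throws a ClassCastException when the column does not hold a double, and a NullPointerException when the value is null, which is why several examples below pair it with isNullAt.

import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;

// A two-column row: (name: String, score: Double). Hypothetical data.
Row row = RowFactory.create("alice", 0.75);

// Guard nullable columns with isNullAt before calling getDouble,
// because getDouble throws NullPointerException on null values.
double score = row.isNullAt(1) ? Double.NaN : row.getDouble(1);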
Example 1
Source File: PopularWordsEstimatorBridgeTest.java    From spark-transformers with Apache License 2.0
private void assertCorrectness(Dataset<Row> rowDataset, Transformer transformer) {
	List<Row> sparkOutput = rowDataset.collectAsList();
	for (Row row : sparkOutput) {
		List<Object> list = row.getList(0);
		String[] sanitizedAddress = new String[list.size()];
		for (int j = 0; j < sanitizedAddress.length; j++) {
			sanitizedAddress[j] = (String) list.get(j);
		}

		Map<String, Object> data = new HashMap<>();
		data.put("sanitizedAddress", sanitizedAddress);

		double expected = row.getDouble(1);
		transformer.transform(data);
		double actual = (double) data.get("commonFraction");

		assertEquals(expected, actual, 0.01);
	}
}
 
Example 2
Source File: SQLDouble.java    From spliceengine with GNU Affero General Public License v3.0
@Override
public void read(Row row, int ordinal) throws StandardException {
	if (row.isNullAt(ordinal))
		setToNull();
	else {
		isNull = false;
		value = row.getDouble(ordinal);
		// NaN must be detected with Double.isNaN(value): in Java,
		// (value == Double.NaN) is always false because NaN compares
		// unequal to everything, including itself.
		if (Double.isInfinite(value) || Double.isNaN(value))
		    throw StandardException.newException(SQLState.LANG_OUTSIDE_RANGE_FOR_DATATYPE, TypeId.DOUBLE_NAME);
	}
}
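A quick reminder on why that check matters: in Java, Double.NaN compares unequal to everything, including itself, so an == test can never detect it and Double.isNaN is required. A minimal sketch:

double value = 0.0 / 0.0;                 // produces NaN
System.out.println(value == Double.NaN);  // false: NaN is unequal to everything
System.out.println(Double.isNaN(value));  // true: the correct check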
 
Example 3
Source File: DataFrameOps.java    From toolbox with Apache License 2.0
private static double[] transformRow2DataInstance(Row row, Attributes attributes) throws Exception {

    double[] instance = new double[row.length()];

    for (int i = 0; i < row.length(); i++) {

        Attribute att = attributes.getFullListOfAttributes().get(i);
        StateSpaceType space = att.getStateSpaceType();

        switch (space.getStateSpaceTypeEnum()) {
            case REAL:
                instance[i] = row.getDouble(i);
                break;

            case FINITE_SET:
                String state = row.getString(i);
                double index = ((FiniteStateSpace) space).getIndexOfState(state);
                instance[i] = index;
                break;

            default:
                // This should never execute: attributes are either REAL or FINITE_SET.
                throw new Exception("Unrecognized state space type: " + space.getStateSpaceTypeEnum());
        }
    }

    return instance;
}
 
Example 4
Source File: DecisionTreeRegressionModelBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testDecisionTreeRegression() {
    // Load the data stored in LIBSVM format as a DataFrame.
    DataFrame data = sqlContext.read().format("libsvm").load("src/test/resources/regression_test.libsvm");

    // Split the data into training and test sets (30% held out for testing)
    DataFrame[] splits = data.randomSplit(new double[]{0.7, 0.3});
    DataFrame trainingData = splits[0];
    DataFrame testData = splits[1];

    // Train a DecisionTree model.
    DecisionTreeRegressionModel regressionModel = new DecisionTreeRegressor()
            .setFeaturesCol("features").fit(trainingData);

    byte[] exportedModel = ModelExporter.export(regressionModel, null);

    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    Row[] sparkOutput = regressionModel.transform(testData).select("features", "prediction").collect();

    //compare predictions
    for (Row row : sparkOutput) {
        Vector v = (Vector) row.get(0);
        double actual = row.getDouble(1);

        Map<String, Object> inputData = new HashMap<String, Object>();
        inputData.put(transformer.getInputKeys().iterator().next(), v.toArray());
        transformer.transform(inputData);
        double predicted = (double) inputData.get(transformer.getOutputKeys().iterator().next());

        System.out.println(actual + ", " + predicted);
        assertEquals(actual, predicted, EPSILON);
    }
}
 
Example 5
Source File: TestHelpers.java    From iceberg with Apache License 2.0
private static Object getPrimitiveValue(Row row, int ord, Type type) {
  if (row.isNullAt(ord)) {
    return null;
  }
  switch (type.typeId()) {
    case BOOLEAN:
      return row.getBoolean(ord);
    case INTEGER:
      return row.getInt(ord);
    case LONG:
      return row.getLong(ord);
    case FLOAT:
      return row.getFloat(ord);
    case DOUBLE:
      return row.getDouble(ord);
    case STRING:
      return row.getString(ord);
    case BINARY:
    case FIXED:
    case UUID:
      return row.get(ord);
    case DATE:
      return row.getDate(ord);
    case TIMESTAMP:
      return row.getTimestamp(ord);
    case DECIMAL:
      return row.getDecimal(ord);
    default:
      throw new IllegalArgumentException("Unhandled type " + type);
  }
}
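The typed accessors are strict: getDouble(ord) throws a ClassCastException if the column is not DoubleType, which is why this helper dispatches on the Iceberg type before choosing an accessor. A hedged alternative sketch (names hypothetical) dispatches on the Row's own Spark schema instead:

import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;

// Hypothetical helper: dispatch on the Row's attached Spark schema rather than
// an external type. Assumes the row carries a schema (rows collected from a
// DataFrame do; bare RowFactory rows return null from schema()).
static Object getValueBySparkType(Row row, int ord) {
  if (row.isNullAt(ord)) {
    return null;
  }
  DataType dt = row.schema().fields()[ord].dataType();
  if (DataTypes.DoubleType.equals(dt)) {
    return row.getDouble(ord);   // safe: the column really is a double
  }
  return row.get(ord);           // fall back to the untyped accessor
}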
 
Example 6
Source File: RandomForestRegressionModelInfoAdapterBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testRandomForestRegression() {
    // Load the data stored in LIBSVM format as a DataFrame.
    DataFrame data = sqlContext.read().format("libsvm").load("src/test/resources/regression_test.libsvm");

    // Split the data into training and test sets (30% held out for testing)
    DataFrame[] splits = data.randomSplit(new double[]{0.7, 0.3});
    DataFrame trainingData = splits[0];
    DataFrame testData = splits[1];

    // Train a RandomForest model.
    RandomForestRegressionModel regressionModel = new RandomForestRegressor()
            .setFeaturesCol("features").fit(trainingData);

    byte[] exportedModel = ModelExporter.export(regressionModel, null);

    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    Row[] sparkOutput = regressionModel.transform(testData).select("features", "prediction").collect();

    //compare predictions
    for (Row row : sparkOutput) {
        Vector v = (Vector) row.get(0);
        double actual = row.getDouble(1);

        Map<String, Object> inputData = new HashMap<String, Object>();
        inputData.put(transformer.getInputKeys().iterator().next(), v.toArray());
        transformer.transform(inputData);
        double predicted = (double) inputData.get(transformer.getOutputKeys().iterator().next());

        System.out.println(actual + ", " + predicted);
        assertEquals(actual, predicted, EPSILON);
    }
}
 
Example 7
Source File: PipelineBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testPipeline() {
    // Prepare training documents, which are labeled.
    StructType schema = createStructType(new StructField[]{
            createStructField("id", LongType, false),
            createStructField("text", StringType, false),
            createStructField("label", DoubleType, false)
    });
    Dataset<Row> trainingData = spark.createDataFrame(Arrays.asList(
            cr(0L, "a b c d e spark", 1.0),
            cr(1L, "b d", 0.0),
            cr(2L, "spark f g h", 1.0),
            cr(3L, "hadoop mapreduce", 0.0)
    ), schema);

    // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and LogisticRegression.
    RegexTokenizer tokenizer = new RegexTokenizer()
            .setInputCol("text")
            .setOutputCol("words")
            .setPattern("\\s")
            .setGaps(true)
            .setToLowercase(false);

    HashingTF hashingTF = new HashingTF()
            .setNumFeatures(1000)
            .setInputCol(tokenizer.getOutputCol())
            .setOutputCol("features");
    LogisticRegression lr = new LogisticRegression()
            .setMaxIter(10)
            .setRegParam(0.01);
    Pipeline pipeline = new Pipeline()
            .setStages(new PipelineStage[]{tokenizer, hashingTF, lr});

    // Fit the pipeline to training documents.
    PipelineModel sparkPipelineModel = pipeline.fit(trainingData);


    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkPipelineModel);
    System.out.println(new String(exportedModel));

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //prepare test data
    StructType testSchema = createStructType(new StructField[]{
            createStructField("id", LongType, false),
            createStructField("text", StringType, false),
    });
    Dataset<Row> testData = spark.createDataFrame(Arrays.asList(
            cr(4L, "spark i j k"),
            cr(5L, "l m n"),
            cr(6L, "mapreduce spark"),
            cr(7L, "apache hadoop")
    ), testSchema);

    //verify that predictions for spark pipeline and exported pipeline are the same
    List<Row> predictions = sparkPipelineModel.transform(testData).select("id", "text", "probability", "prediction").collectAsList();
    for (Row r : predictions) {
        System.out.println(r);
        double sparkPipelineOp = r.getDouble(3);
        Map<String, Object> data = new HashMap<String, Object>();
        data.put("text", r.getString(1));
        transformer.transform(data);
        double exportedPipelineOp = (double) data.get("prediction");
        double exportedPipelineProb = (double) data.get("probability");
        assertEquals(sparkPipelineOp, exportedPipelineOp, 0.01);
    }
}
 
Example 8
Source File: AlgebraicTransformBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testAlgebraicTransform(){
    //get expected Ax + b transform for given data
    double[] axB = axBTranform(this.coeff, this.data);
    // prepare data
    JavaRDD<Row> jrdd = sc.parallelize(Arrays.asList(
            RowFactory.create((data[0])),
            RowFactory.create((data[1])),
            RowFactory.create((data[2]))
    ));

    StructType schema = new StructType(new StructField[]{
            new StructField("trueProb", DataTypes.DoubleType, false, Metadata.empty())
    });

    DataFrame df = sqlContext.createDataFrame(jrdd, schema);

    AlgebraicTransform customSparkModel = new AlgebraicTransform()
            .setInputCol("trueProb")
            .setOutputCol("scaledProb")
            .setCoefficients(coeff);

    //Export this model
    byte[] exportedModel = ModelExporter.export(customSparkModel, df);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //compare predictions
    Row[] customSparkOutput = customSparkModel.transform(df).select("trueProb", "scaledProb").collect();

    for (int i = 0; i < customSparkOutput.length; i++) {
        Row row = customSparkOutput[i];
        Map<String, Object> mapData = new HashMap<String, Object>();
        mapData.put(transformer.getInputKeys().iterator().next(), row.getDouble(0));
        transformer.transform(mapData);
        double transformedOp = (double) mapData.get(transformer.getOutputKeys().iterator().next());

        double sparkOp = row.getDouble(1);
        //Check if imported model produces same result as spark output
        assertEquals(transformedOp, sparkOp, 0.000001);
        //check if spark output is correct. This also tests for correctness of AlgebraicTransform
        assertEquals(axB[i], sparkOp, 0.000001);

    }
}
 
Example 9
Source File: ProbabilityTransformBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testProbabilityTransform(){
    //get expected true probability
    double[] trueProb = getTrueProb(data, this.p1, this.r1);
    // prepare data
    JavaRDD<Row> jrdd = sc.parallelize(Arrays.asList(
            RowFactory.create((data[0])),
            RowFactory.create((data[1])),
            RowFactory.create((data[2]))
    ));

    StructType schema = new StructType(new StructField[]{
            new StructField("probability", DataTypes.DoubleType, false, Metadata.empty())
            //new StructField("probability", new VectorUDT(), false, Metadata.empty()),
    });

    DataFrame df = sqlContext.createDataFrame(jrdd, schema);

    ProbabilityTransformModel customSparkModel = new ProbabilityTransform()
            .setInputCol("probability")
            .setOutputCol("trueProbability")
            .setActualClickProportion(p1)
            .setUnderSampledClickProportion(r1)
            .setProbIndex(idx)
            .fit(df);

    //Export this model
    byte[] exportedModel = ModelExporter.export(customSparkModel, df);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //compare predictions
    Row[] customSparkOutput = customSparkModel.transform(df).select("probability", "trueProbability").collect();

    for (int i = 0; i < customSparkOutput.length; i++) {
        Row row = customSparkOutput[i];
        Map<String, Object> mapData = new HashMap<String, Object>();
        mapData.put(transformer.getInputKeys().iterator().next(), row.getDouble(0));
        transformer.transform(mapData);
        double transformedOp = (double) mapData.get(transformer.getOutputKeys().iterator().next());

        double sparkOp = row.getDouble(1);
        //Check if imported model produces same result as spark output
        assertEquals(transformedOp, sparkOp, 0.000001);
        //check if spark output is correct. This also tests for correctness of ProbabilityTransform
        assertEquals(trueProb[i], sparkOp, 0.000001);

    }
}
 
Example 10
Source File: BucketizerBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void bucketizerTest() {
    double[] validData = {-0.5, -0.3, 0.0, 0.2};
    double[] expectedBuckets = {0.0, 0.0, 1.0, 1.0};
    double[] splits = {-0.5, 0.0, 0.5};

    StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
            new StructField("feature", DataTypes.DoubleType, false, Metadata.empty())
    });
    List<Row> trainingData = Arrays.asList(
            cr(0, validData[0]),
            cr(1, validData[1]),
            cr(2, validData[2]),
            cr(3, validData[3]));

    Dataset<Row> df = spark.createDataFrame(trainingData, schema);

    Bucketizer sparkModel = new Bucketizer()
            .setInputCol("feature")
            .setOutputCol("result")
            .setSplits(splits);

    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkModel);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    List<Row> sparkOutput = sparkModel.transform(df).orderBy("id").select("id", "feature", "result").collectAsList();

    for (Row r : sparkOutput) {
        double input = r.getDouble(1);
        double sparkOp = r.getDouble(2);

        Map<String, Object> data = new HashMap<String, Object>();
        data.put(sparkModel.getInputCol(), input);
        transformer.transform(data);
        double transformedInput = (double) data.get(sparkModel.getOutputCol());

        assertTrue((transformedInput >= 0) && (transformedInput <= 1));
        assertEquals(transformedInput, sparkOp, 0.01);
        assertEquals(transformedInput, expectedBuckets[r.getInt(0)], 0.01);
    }
}
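For reference, the splits array {-0.5, 0.0, 0.5} defines two buckets: values in [-0.5, 0.0) map to bucket 0.0 and values in [0.0, 0.5] map to bucket 1.0 (Spark's Bucketizer includes the upper bound only in the last bucket), which is exactly the expectedBuckets array above.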
 
Example 11
Source File: TransitionClassifier.java    From vn.vitk with GNU General Public License v3.0
/**
 * Trains a transition classifier on the data frame.
 * @param jsc
 * @param graphs
 * @param featureFrame
 * @param classifierFileName
 * @param numHiddenUnits
 * @return a transition classifier.
 */
public Transformer trainMLP(JavaSparkContext jsc,
		List<DependencyGraph> graphs, FeatureFrame featureFrame,
		String classifierFileName, int numHiddenUnits) {
	// create a SQLContext
	this.sqlContext = new SQLContext(jsc);
	// extract a data frame from these graphs
	DataFrame dataset = toDataFrame(jsc, graphs, featureFrame);
	
	// create a processing pipeline and fit it to the data frame
	Pipeline pipeline = createPipeline();
	PipelineModel pipelineModel = pipeline.fit(dataset);
	DataFrame trainingData = pipelineModel.transform(dataset);
	
	// cache the training data for better performance
	trainingData.cache();
	
	if (verbose) {
		trainingData.show(false);
	}
	
	// compute the number of different labels, which is the maximum element 
	// in the 'label' column.
	trainingData.registerTempTable("dfTable");
	Row row = sqlContext.sql("SELECT MAX(label) as maxValue from dfTable").first();
	int numLabels = (int)row.getDouble(0);
	numLabels++;
	
	int vocabSize = ((CountVectorizerModel)(pipelineModel.stages()[1])).getVocabSize();
	
	// default is a two-layer MLP
	int[] layers = {vocabSize, numLabels};
	// if user specify a hidden layer, use a 3-layer MLP:
	if (numHiddenUnits > 0) {
		layers = new int[3];
		layers[0] = vocabSize;
		layers[1] = numHiddenUnits;
		layers[2] = numLabels;
	}
	MultilayerPerceptronClassifier classifier = new MultilayerPerceptronClassifier()
		.setLayers(layers)
		.setBlockSize(128)
		.setSeed(1234L)
		.setTol((Double)params.getOrDefault(params.getTolerance()))
		.setMaxIter((Integer)params.getOrDefault(params.getMaxIter()));
	MultilayerPerceptronClassificationModel model = classifier.fit(trainingData);
	
	// compute precision on the training data
	//
	DataFrame result = model.transform(trainingData);
	DataFrame predictionAndLabel = result.select("prediction", "label");
	MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator().setMetricName("precision");
	if (verbose) {
		System.out.println("N = " + trainingData.count());
		System.out.println("D = " + vocabSize);
		System.out.println("K = " + numLabels);
		System.out.println("H = " + numHiddenUnits);
		System.out.println("training precision = " + evaluator.evaluate(predictionAndLabel));
	}
	
	// save the trained MLP to a file
	//
	String classifierPath = new Path(classifierFileName, "data").toString();
	jsc.parallelize(Arrays.asList(model), 1).saveAsObjectFile(classifierPath);
	// save the pipeline model to sub-directory "pipelineModel"
	// 
	try {
		String pipelinePath = new Path(classifierFileName, "pipelineModel").toString(); 
		pipelineModel.write().overwrite().save(pipelinePath);
	} catch (IOException e) {
		e.printStackTrace();
	}
	return model;
}
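One detail worth noting: label is a DoubleType column, so the MAX(label) aggregate also comes back as a double and must be read with getDouble(0) before narrowing to int. The temp-table detour is not required; a hedged equivalent sketch using the DataFrame aggregation API (same names as above):

import static org.apache.spark.sql.functions.max;

// Equivalent of the SQL query above: MAX over a double column is a double.
Row maxRow = trainingData.agg(max("label")).first();
int numLabels = (int) maxRow.getDouble(0) + 1;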
 
Example 12
Source File: DecisionTreeRegressionModelBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testDecisionTreeRegressionWithPipeline() {
    // Load the data stored in LIBSVM format as a DataFrame.
    DataFrame data = sqlContext.read().format("libsvm").load("src/test/resources/regression_test.libsvm");

    // Split the data into training and test sets (30% held out for testing)
    DataFrame[] splits = data.randomSplit(new double[]{0.7, 0.3});
    DataFrame trainingData = splits[0];
    DataFrame testData = splits[1];

    // Train a DecisionTree model.
    DecisionTreeRegressor dt = new DecisionTreeRegressor()
            .setFeaturesCol("features");

    Pipeline pipeline = new Pipeline()
            .setStages(new PipelineStage[]{dt});

    // Train model.  This also runs the indexer.
    PipelineModel sparkPipeline = pipeline.fit(trainingData);

    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkPipeline, null);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    Row[] sparkOutput = sparkPipeline.transform(testData).select("features", "prediction").collect();

    //compare predictions
    for (Row row : sparkOutput) {
        Vector v = (Vector) row.get(0);
        double actual = row.getDouble(1);

        Map<String, Object> inputData = new HashMap<String, Object>();
        inputData.put(transformer.getInputKeys().iterator().next(), v.toArray());
        transformer.transform(inputData);
        double predicted = (double) inputData.get(transformer.getOutputKeys().iterator().next());

        assertEquals(actual, predicted, EPSILON);
    }
}
 
Example 13
Source File: BucketizerBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void bucketizerTest() {
    double[] validData = {-0.5, -0.3, 0.0, 0.2};
    double[] expectedBuckets = {0.0, 0.0, 1.0, 1.0};
    double[] splits = {-0.5, 0.0, 0.5};

    StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
            new StructField("feature", DataTypes.DoubleType, false, Metadata.empty())
    });
    List<Row> trainingData = Arrays.asList(
            cr(0, validData[0]),
            cr(1, validData[1]),
            cr(2, validData[2]),
            cr(3, validData[3]));

    DataFrame df = sqlContext.createDataFrame(trainingData, schema);

    Bucketizer sparkModel = new Bucketizer()
            .setInputCol("feature")
            .setOutputCol("result")
            .setSplits(splits);

    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkModel, df);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    Row[] sparkOutput = sparkModel.transform(df).orderBy("id").select("id", "feature", "result").collect();

    for (Row r : sparkOutput) {
        double input = r.getDouble(1);
        double sparkOp = r.getDouble(2);

        Map<String, Object> data = new HashMap<String, Object>();
        data.put(sparkModel.getInputCol(), input);
        transformer.transform(data);
        double transformedInput = (double) data.get(sparkModel.getOutputCol());

        assertTrue((transformedInput >= 0) && (transformedInput <= 1));
        assertEquals(transformedInput, sparkOp, EPSILON);
        assertEquals(transformedInput, expectedBuckets[r.getInt(0)], EPSILON);
    }
}
 
Example 14
Source File: PipelineBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testPipeline() {
    // Prepare training documents, which are labeled.
    StructType schema = createStructType(new StructField[]{
            createStructField("id", LongType, false),
            createStructField("text", StringType, false),
            createStructField("label", DoubleType, false)
    });
    DataFrame trainingData = sqlContext.createDataFrame(Arrays.asList(
            cr(0L, "a b c d e spark", 1.0),
            cr(1L, "b d", 0.0),
            cr(2L, "spark f g h", 1.0),
            cr(3L, "hadoop mapreduce", 0.0)
    ), schema);

    // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and LogisticRegression.
    RegexTokenizer tokenizer = new RegexTokenizer()
            .setInputCol("text")
            .setOutputCol("words")
            .setPattern("\\s")
            .setGaps(true)
            .setToLowercase(false);

    HashingTF hashingTF = new HashingTF()
            .setNumFeatures(1000)
            .setInputCol(tokenizer.getOutputCol())
            .setOutputCol("features");
    LogisticRegression lr = new LogisticRegression()
            .setMaxIter(10)
            .setRegParam(0.01);
    Pipeline pipeline = new Pipeline()
            .setStages(new PipelineStage[]{tokenizer, hashingTF, lr});

    // Fit the pipeline to training documents.
    PipelineModel sparkPipelineModel = pipeline.fit(trainingData);


    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkPipelineModel, trainingData);
    System.out.println(new String(exportedModel));

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //prepare test data
    StructType testSchema = createStructType(new StructField[]{
            createStructField("id", LongType, false),
            createStructField("text", StringType, false),
    });
    DataFrame testData = sqlContext.createDataFrame(Arrays.asList(
            cr(4L, "spark i j k"),
            cr(5L, "l m n"),
            cr(6L, "mapreduce spark"),
            cr(7L, "apache hadoop")
    ), testSchema);

    //verify that predictions for spark pipeline and exported pipeline are the same
    Row[] predictions = sparkPipelineModel.transform(testData).select("id", "text", "probability", "prediction").collect();
    for (Row r : predictions) {
        System.out.println(r);
        double sparkPipelineOp = r.getDouble(3);
        Map<String, Object> data = new HashMap<String, Object>();
        data.put("text", r.getString(1));
        transformer.transform(data);
        double exportedPipelineOp = (double) data.get("prediction");
        double exportedPipelineProb = (double) data.get("probability");
        assertEquals(sparkPipelineOp, exportedPipelineOp, EPSILON);
    }
}
 
Example 15
Source File: RandomForestClassificationModelInfoAdapterBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testRandomForestClassification() {
    // Load the data stored in LIBSVM format as a DataFrame.
    DataFrame data = sqlContext.read().format("libsvm").load("src/test/resources/classification_test.libsvm");

    StringIndexerModel stringIndexerModel = new StringIndexer()
            .setInputCol("label")
            .setOutputCol("labelIndex")
            .fit(data);

    data = stringIndexerModel.transform(data);

    // Split the data into training and test sets (30% held out for testing)
    DataFrame[] splits = data.randomSplit(new double[]{0.7, 0.3});
    DataFrame trainingData = splits[0];
    DataFrame testData = splits[1];

    // Train a RandomForest model.
    RandomForestClassificationModel classificationModel = new RandomForestClassifier()
            .setLabelCol("labelIndex")
            .setFeaturesCol("features")
            .setPredictionCol("prediction")
            .setRawPredictionCol("rawPrediction")
            .setProbabilityCol("probability")
            .fit(trainingData);


    byte[] exportedModel = ModelExporter.export(classificationModel, null);

    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    Row[] sparkOutput = classificationModel.transform(testData).select("features", "prediction", "rawPrediction", "probability").collect();

    //compare predictions
    for (Row row : sparkOutput) {
        Vector v = (Vector) row.get(0);
        double actual = row.getDouble(1);
        double [] actualProbability = ((Vector) row.get(3)).toArray();
        double[] actualRaw = ((Vector) row.get(2)).toArray();

        Map<String, Object> inputData = new HashMap<String, Object>();
        inputData.put(transformer.getInputKeys().iterator().next(), v.toArray());
        transformer.transform(inputData);
        double predicted = (double) inputData.get("prediction");
        double[] probability = (double[]) inputData.get("probability");
        double[] rawPrediction = (double[]) inputData.get("rawPrediction");

        assertEquals(actual, predicted, EPSILON);
        assertArrayEquals(actualProbability, probability, EPSILON);
        assertArrayEquals(actualRaw, rawPrediction, EPSILON);


    }

}
 
Example 16
Source File: RandomForestClassificationModelInfoAdapterBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testRandomForestClassificationWithPipeline() {
    // Load the data stored in LIBSVM format as a DataFrame.
    DataFrame data = sqlContext.read().format("libsvm").load("src/test/resources/classification_test.libsvm");

    // Split the data into training and test sets (30% held out for testing)
    DataFrame[] splits = data.randomSplit(new double[]{0.7, 0.3});
    DataFrame trainingData = splits[0];
    DataFrame testData = splits[1];

    StringIndexer indexer = new StringIndexer()
            .setInputCol("label")
            .setOutputCol("labelIndex");

    // Train a DecisionTree model.
    RandomForestClassifier classifier = new RandomForestClassifier()
            .setLabelCol("labelIndex")
            .setFeaturesCol("features")
            .setPredictionCol("prediction")
            .setRawPredictionCol("rawPrediction")
            .setProbabilityCol("probability");


    Pipeline pipeline = new Pipeline()
            .setStages(new PipelineStage[]{indexer, classifier});

    // Train model.  This also runs the indexer.
    PipelineModel sparkPipeline = pipeline.fit(trainingData);

    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkPipeline, null);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    Row[] sparkOutput = sparkPipeline.transform(testData).select("label", "features", "prediction", "rawPrediction", "probability").collect();

    //compare predictions
    for (Row row : sparkOutput) {
        Vector v = (Vector) row.get(1);
        double actual = row.getDouble(2);
        double [] actualProbability = ((Vector) row.get(4)).toArray();
        double[] actualRaw = ((Vector) row.get(3)).toArray();

        Map<String, Object> inputData = new HashMap<String, Object>();
        inputData.put("features", v.toArray());
        inputData.put("label", row.get(0).toString());
        transformer.transform(inputData);
        double predicted = (double) inputData.get("prediction");
        double[] probability = (double[]) inputData.get("probability");
        double[] rawPrediction = (double[]) inputData.get("rawPrediction");

        assertEquals(actual, predicted, EPSILON);
        assertArrayEquals(actualProbability, probability, EPSILON);
        assertArrayEquals(actualRaw, rawPrediction, EPSILON);
    }
}
 
Example 17
Source File: DecisionTreeClassificationModelBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testDecisionTreeClassificationRawPrediction() {
    // Load the data stored in LIBSVM format as a DataFrame.
    DataFrame data = sqlContext.read().format("libsvm").load("src/test/resources/classification_test.libsvm");

    StringIndexerModel stringIndexerModel = new StringIndexer()
            .setInputCol("label")
            .setOutputCol("labelIndex")
            .fit(data);

    data = stringIndexerModel.transform(data);

    // Split the data into training and test sets (30% held out for testing)
    DataFrame[] splits = data.randomSplit(new double[]{0.7, 0.3});
    DataFrame trainingData = splits[0];
    DataFrame testData = splits[1];

    // Train a DecisionTree model.
    DecisionTreeClassificationModel classificationModel = new DecisionTreeClassifier()
            .setLabelCol("labelIndex")
            .setFeaturesCol("features")
            .setRawPredictionCol("rawPrediction")
            .setPredictionCol("prediction")
            .fit(trainingData);

    byte[] exportedModel = ModelExporter.export(classificationModel, null);

    Transformer transformer = (DecisionTreeTransformer) ModelImporter.importAndGetTransformer(exportedModel);

    Row[] sparkOutput = classificationModel.transform(testData).select("features", "prediction", "rawPrediction").collect();

    //compare predictions
    for (Row row : sparkOutput) {
        Vector inp = (Vector) row.get(0);
        double actual = row.getDouble(1);
        double[] actualRaw = ((Vector) row.get(2)).toArray();

        Map<String, Object> inputData = new HashMap<>();
        inputData.put(transformer.getInputKeys().iterator().next(), inp.toArray());
        transformer.transform(inputData);
        double predicted = (double) inputData.get(transformer.getOutputKeys().iterator().next());
        double[] rawPrediction = (double[]) inputData.get("rawPrediction");

        assertEquals(actual, predicted, EPSILON);
        assertArrayEquals(actualRaw, rawPrediction, EPSILON);
    }
}
 
Example 18
Source File: DecisionTreeClassificationModelBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testDecisionTreeClassificationWithPipeline() {
    // Load the data stored in LIBSVM format as a DataFrame.
    DataFrame data = sqlContext.read().format("libsvm").load("src/test/resources/classification_test.libsvm");

    // Split the data into training and test sets (30% held out for testing)
    DataFrame[] splits = data.randomSplit(new double[]{0.7, 0.3});
    DataFrame trainingData = splits[0];
    DataFrame testData = splits[1];

    StringIndexer indexer = new StringIndexer()
            .setInputCol("label")
            .setOutputCol("labelIndex");

    // Train a DecisionTree model.
    DecisionTreeClassifier classificationModel = new DecisionTreeClassifier()
            .setLabelCol("labelIndex")
            .setFeaturesCol("features");

    Pipeline pipeline = new Pipeline()
            .setStages(new PipelineStage[]{indexer, classificationModel});

    // Train model.  This also runs the indexer.
    PipelineModel sparkPipeline = pipeline.fit(trainingData);

    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkPipeline, null);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    Row[] sparkOutput = sparkPipeline.transform(testData).select("label", "features", "prediction").collect();

    //compare predictions
    for (Row row : sparkOutput) {
        Vector v = (Vector) row.get(1);
        double actual = row.getDouble(2);

        Map<String, Object> inputData = new HashMap<String, Object>();
        inputData.put("features", v.toArray());
        inputData.put("label", row.get(0).toString());
        transformer.transform(inputData);
        double predicted = (double) inputData.get("prediction");

        assertEquals(actual, predicted, EPSILON);
    }
}
 
Example 19
Source File: AverageUDAF.java    From Apache-Spark-2x-for-Java-Developers with MIT License
@Override
public Object evaluate(Row row) {		
	return row.getDouble(0)/row.getDouble(1);
}
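Note that this divides the buffer's sum (slot 0) by its count (slot 1); for an empty group both are 0.0 and the double division silently yields NaN. A hedged variant that surfaces the empty case as null instead (buffer layout assumed as above):

@Override
public Object evaluate(Row buffer) {
	double count = buffer.getDouble(1);
	// 0.0 / 0.0 is NaN for doubles; return null for empty groups instead.
	return count == 0.0 ? null : buffer.getDouble(0) / count;
}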