org.apache.spark.mllib.regression.LabeledPoint Java Examples

The following examples show how to use org.apache.spark.mllib.regression.LabeledPoint. Each example is taken from an open-source project; the source file and project are noted above each snippet.
Example #1
Source File: LogisticRegressionExporterTest.java    From spark-transformers with Apache License 2.0
@Test
public void shouldExportAndImportCorrectly() {
    String datapath = "src/test/resources/binary_classification_test.libsvm";
    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc.sc(), datapath).toJavaRDD();

    //Train model in spark
    LogisticRegressionModel lrmodel = new LogisticRegressionWithSGD().run(data.rdd());

    //Export this model
    byte[] exportedModel = ModelExporter.export(lrmodel, null);

    //Import it back
    LogisticRegressionModelInfo importedModel = (LogisticRegressionModelInfo) ModelImporter.importModelInfo(exportedModel);

    //check that they are equal with respect to their fields
    //(edge cases are possible, e.g. the order of elements in a list may change)
    assertEquals(lrmodel.intercept(), importedModel.getIntercept(), EPSILON);
    assertEquals(lrmodel.numClasses(), importedModel.getNumClasses(), EPSILON);
    assertEquals(lrmodel.numFeatures(), importedModel.getNumFeatures(), EPSILON);
    assertEquals((double) lrmodel.getThreshold().get(), importedModel.getThreshold(), EPSILON);
    for (int i = 0; i < importedModel.getNumFeatures(); i++)
        assertEquals(lrmodel.weights().toArray()[i], importedModel.getWeights()[i], EPSILON);

}
 
Example #2
Source File: MLLibUtil.java    From deeplearning4j with Apache License 2.0
/**
 * Returns a labeled point built from the given writables,
 * where the final item is the label and the preceding items
 * are the features
 * @param writables the writables
 * @return the labeled point
 */
public static LabeledPoint pointOf(Collection<Writable> writables) {
    double[] ret = new double[writables.size() - 1];
    int count = 0;
    double target = 0;
    for (Writable w : writables) {
        if (count < writables.size() - 1)
            ret[count++] = Double.parseDouble(w.toString());
        else
            target = Double.parseDouble(w.toString());
    }

    if (target < 0)
        throw new IllegalStateException("Target must be >= 0");
    return new LabeledPoint(target, Vectors.dense(ret));
}
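A short usage sketch for pointOf with hypothetical values; DoubleWritable here is assumed to be DataVec's org.datavec.api.writable.DoubleWritable, whose toString() prints the raw value:

// Hypothetical usage: three feature values followed by the label.
List<Writable> row = Arrays.<Writable>asList(
        new DoubleWritable(0.5),
        new DoubleWritable(1.25),
        new DoubleWritable(-3.0),
        new DoubleWritable(2.0));   // the last item is the label
LabeledPoint lp = MLLibUtil.pointOf(row);
// lp.label() == 2.0, lp.features() == [0.5, 1.25, -3.0]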
 
Example #3
Source File: MLLibUtil.java    From deeplearning4j with Apache License 2.0
/**
 * Convert the output of a traditional sc.binaryFiles call
 * into something usable for machine learning
 * @param binaryFiles the binary files to convert
 * @param reader the reader to use
 * @return the labeled points based on the given rdd
 */
public static JavaRDD<LabeledPoint> fromBinary(JavaPairRDD<String, PortableDataStream> binaryFiles,
                final RecordReader reader) {
    JavaRDD<Collection<Writable>> records =
                    binaryFiles.map(new Function<Tuple2<String, PortableDataStream>, Collection<Writable>>() {
                        @Override
                        public Collection<Writable> call(
                                        Tuple2<String, PortableDataStream> stringPortableDataStreamTuple2)
                                        throws Exception {
                            reader.initialize(new InputStreamInputSplit(stringPortableDataStreamTuple2._2().open(),
                                            stringPortableDataStreamTuple2._1()));
                            return reader.next();
                        }
                    });

    JavaRDD<LabeledPoint> ret = records.map(new Function<Collection<Writable>, LabeledPoint>() {
        @Override
        public LabeledPoint call(Collection<Writable> writables) throws Exception {
            return pointOf(writables);
        }
    });
    return ret;
}
 
Example #4
Source File: RDFUpdate.java    From oryx with Apache License 2.0
/**
 * @param trainPointData data to run down trees
 * @param model random decision forest model to count on
 * @return map of predictor index to the number of training examples that reached a
 *  node whose decision is based on that feature. The index is among predictors, not all
 *  features, since there are fewer predictors than features. That is, the index will
 *  match the one used in the {@link RandomForestModel}.
 */
private static IntLongHashMap predictorExampleCounts(JavaRDD<? extends LabeledPoint> trainPointData,
                                                     RandomForestModel model) {
  return trainPointData.mapPartitions(data -> {
      IntLongHashMap featureIndexCount = new IntLongHashMap();
      data.forEachRemaining(datum -> {
        double[] featureVector = datum.features().toArray();
        for (DecisionTreeModel tree : model.trees()) {
          org.apache.spark.mllib.tree.model.Node node = tree.topNode();
          // This logic cloned from Node.predict:
          while (!node.isLeaf()) {
            Split split = node.split().get();
            int featureIndex = split.feature();
            // Count feature
            featureIndexCount.addToValue(featureIndex, 1);
            node = nextNode(featureVector, node, split, featureIndex);
          }
        }
      });
      return Collections.singleton(featureIndexCount).iterator();
  }).reduce(RDFUpdate::merge);
}
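The merge helper referenced via RDFUpdate::merge (and reused in Example #21 below) is not part of this snippet. A minimal sketch of what it presumably does, assuming Eclipse Collections' IntLongHashMap:

// Hypothetical sketch, not the original implementation: fold every
// (key, count) pair of b into a and return the combined map.
private static IntLongHashMap merge(IntLongHashMap a, IntLongHashMap b) {
    b.forEachKeyValue(a::addToValue);
    return a;
}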
 
Example #5
Source File: LogisticRegressionExporterTest.java    From spark-transformers with Apache License 2.0
@Test
public void shouldExportAndImportCorrectly() {
    String datapath = "src/test/resources/binary_classification_test.libsvm";
    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();

    //Train model in spark
    LogisticRegressionModel lrmodel = new LogisticRegressionWithSGD().run(data.rdd());

    //Export this model
    byte[] exportedModel = ModelExporter.export(lrmodel);

    //Import it back
    LogisticRegressionModelInfo importedModel = (LogisticRegressionModelInfo) ModelImporter.importModelInfo(exportedModel);

    //check that they are equal with respect to their fields
    //(edge cases are possible, e.g. the order of elements in a list may change)
    assertEquals(lrmodel.intercept(), importedModel.getIntercept(), 0.01);
    assertEquals(lrmodel.numClasses(), importedModel.getNumClasses(), 0.01);
    assertEquals(lrmodel.numFeatures(), importedModel.getNumFeatures(), 0.01);
    assertEquals((double) lrmodel.getThreshold().get(), importedModel.getThreshold(), 0.01);
    for (int i = 0; i < importedModel.getNumFeatures(); i++)
        assertEquals(lrmodel.weights().toArray()[i], importedModel.getWeights()[i], 0.01);

}
 
Example #6
Source File: MLLibUtil.java    From deeplearning4j with Apache License 2.0
/**
 * Convert an RDD of DataSet into an RDD of LabeledPoint.
 * @param data the dataset to convert
 * @param preCache whether to pre-cache the rdd before the operation
 * @return an rdd of labeled point
 */
public static JavaRDD<LabeledPoint> fromDataSet(JavaRDD<DataSet> data, boolean preCache) {
    if (preCache && !data.getStorageLevel().useMemory()) {
        data.cache();
    }
    return data.map(new Function<DataSet, LabeledPoint>() {
        @Override
        public LabeledPoint call(DataSet dataSet) {
            return toLabeledPoint(dataSet);
        }
    });
}
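A usage sketch, assuming an existing JavaSparkContext named sc (mirroring the Iris-based smoke test in Example #9):

// Build a small RDD of single-example DataSets and convert it to labeled
// points, letting the utility cache the input first.
List<DataSet> irisExamples = new IrisDataSetIterator(150, 150).next().asList();
JavaRDD<DataSet> dataSets = sc.parallelize(irisExamples);
JavaRDD<LabeledPoint> points = MLLibUtil.fromDataSet(dataSets, true);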
 
Example #7
Source File: TestSparkMultiLayerParameterAveraging.java    From deeplearning4j with Apache License 2.0
@Test
public void testFromSvmLight() throws Exception {
    JavaRDD<LabeledPoint> data = MLUtils
                    .loadLibSVMFile(sc.sc(),
                                    new ClassPathResource("svmLight/iris_svmLight_0.txt").getTempFileFromArchive()
                                                    .getAbsolutePath())
                    .toJavaRDD().map(new TestFn());

    MultiLayerConfiguration conf =
                    new NeuralNetConfiguration.Builder().seed(123)
                                    .updater(new Adam(1e-6))
                            .weightInit(WeightInit.XAVIER)
                            .list()
                            .layer(new BatchNormalization.Builder().nIn(4).nOut(4).build())
                            .layer(new DenseLayer.Builder().nIn(4).nOut(32).activation(Activation.RELU).build())
                            .layer(new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(LossFunctions.LossFunction.MCXENT).nIn(32).nOut(3)
                                                                    .activation(Activation.SOFTMAX).build())
                                    .build();



    MultiLayerNetwork network = new MultiLayerNetwork(conf);
    network.init();
    System.out.println("Initializing network");
    SparkDl4jMultiLayer master = new SparkDl4jMultiLayer(sc, getBasicConf(),
                    new ParameterAveragingTrainingMaster(true, numExecutors(), 1, 5, 1, 0));

    master.fitLabeledPoint(data);
}
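TestFn, referenced here and in Example #8, is defined elsewhere in this test class and is not shown in the snippet. A hypothetical identity stand-in, just so the example reads end-to-end:

// Hypothetical stand-in for the TestFn referenced above (the real class is
// defined elsewhere in the test suite); this version passes points through.
static class TestFn implements Function<LabeledPoint, LabeledPoint> {
    @Override
    public LabeledPoint call(LabeledPoint lp) {
        return lp;
    }
}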
 
Example #8
Source File: TestSparkMultiLayerParameterAveraging.java    From deeplearning4j with Apache License 2.0
@Test
public void testFromSvmLightBackprop() throws Exception {
    JavaRDD<LabeledPoint> data = MLUtils
                    .loadLibSVMFile(sc.sc(),
                                    new ClassPathResource("svmLight/iris_svmLight_0.txt").getTempFileFromArchive()
                                                    .getAbsolutePath())
                    .toJavaRDD().map(new TestFn());

    DataSet d = new IrisDataSetIterator(150, 150).next();
    MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().seed(123)
                    .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT).list()
                    .layer(0, new DenseLayer.Builder().nIn(4).nOut(100).weightInit(WeightInit.XAVIER)
                                    .activation(Activation.RELU).build())
                    .layer(1, new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(
                                    LossFunctions.LossFunction.MCXENT).nIn(100).nOut(3)
                                                    .activation(Activation.SOFTMAX).weightInit(WeightInit.XAVIER)
                                                    .build())
                    .build();



    MultiLayerNetwork network = new MultiLayerNetwork(conf);
    network.init();
    System.out.println("Initializing network");

    SparkDl4jMultiLayer master = new SparkDl4jMultiLayer(sc, conf,
                    new ParameterAveragingTrainingMaster(true, numExecutors(), 1, 5, 1, 0));

    MultiLayerNetwork network2 = master.fitLabeledPoint(data);
}
 
Example #9
Source File: MLLIbUtilTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testMlLibTest() {
    DataSet dataSet = new IrisDataSetIterator(150, 150).next();
    List<DataSet> list = dataSet.asList();
    JavaRDD<DataSet> data = sc.parallelize(list);
    JavaRDD<LabeledPoint> mlLibData = MLLibUtil.fromDataSet(sc, data);
}
 
Example #10
Source File: MLLibUtil.java    From deeplearning4j with Apache License 2.0
/**
 * Convert a labeled point to a {@link DataSet}
 * @param point the labeled point to convert
 * @param numPossibleLabels the number of possible labels
 * @return {@link DataSet}
 */
private static DataSet fromLabeledPoint(LabeledPoint point, long numPossibleLabels) {
    Vector features = point.features();
    double label = point.label();

    // FIXME: int cast
    double[] fArr = features.toArray();
    return new DataSet(Nd4j.create(fArr, new long[]{1,fArr.length}),
                    FeatureUtil.toOutcomeVector((int) label, (int) numPossibleLabels));
}
 
Example #11
Source File: MLLibUtil.java    From deeplearning4j with Apache License 2.0
/**
 * Convert a list of labeled points to a list of {@link DataSet}
 * @param labeledPoints the labeled points to convert
 * @param numPossibleLabels the number of possible labels
 * @return List of {@link DataSet}
 */
private static List<DataSet> fromLabeledPoint(List<LabeledPoint> labeledPoints, long numPossibleLabels) {
    List<DataSet> ret = new ArrayList<>();
    for (LabeledPoint point : labeledPoints) {
        ret.add(fromLabeledPoint(point, numPossibleLabels));
    }
    return ret;
}
 
Example #12
Source File: MLLibUtil.java    From deeplearning4j with Apache License 2.0
/**
 * Convert a list of datasets into a list of labeled points
 * @param labeledPoints the datasets to convert
 * @return the labeled point list
 */
private static List<LabeledPoint> toLabeledPoint(List<DataSet> labeledPoints) {
    List<LabeledPoint> ret = new ArrayList<>();
    for (DataSet point : labeledPoints) {
        ret.add(toLabeledPoint(point));
    }
    return ret;
}
 
Example #13
Source File: MLLibUtil.java    From deeplearning4j with Apache License 2.0
/**
 * Converts a JavaRDD of LabeledPoint with continuous labels to a JavaRDD of DataSet.
 * @param data JavaRDD of LabeledPoint
 * @param preCache whether to pre-cache the rdd before the operation
 * @return a JavaRDD of DataSet with continuous labels
 */
public static JavaRDD<DataSet> fromContinuousLabeledPoint(JavaRDD<LabeledPoint> data, boolean preCache) {
    if (preCache && !data.getStorageLevel().useMemory()) {
        data.cache();
    }
    return data.map(new Function<LabeledPoint, DataSet>() {
        @Override
        public DataSet call(LabeledPoint lp) {
            return convertToDataset(lp);
        }
    });
}
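A usage sketch, assuming an existing JavaSparkContext named sc. Because the labels stay continuous, each resulting DataSet carries a single-element label vector rather than a one-hot encoding:

// Two hypothetical regression-style points with continuous labels.
List<LabeledPoint> raw = Arrays.asList(
        new LabeledPoint(0.37, Vectors.dense(1.0, 2.0)),
        new LabeledPoint(1.82, Vectors.dense(3.0, 4.0)));
JavaRDD<DataSet> dataSets =
        MLLibUtil.fromContinuousLabeledPoint(sc.parallelize(raw), false);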
 
Example #14
Source File: MLLibUtil.java    From deeplearning4j with Apache License 2.0
/**
 * Convert an RDD of DataSet into an RDD of LabeledPoint.
 * @param sc the spark context to use
 * @param data the dataset to convert
 * @return an rdd of labeled point
 * @deprecated Use {@link #fromDataSet(JavaRDD)}
 */
@Deprecated
public static JavaRDD<LabeledPoint> fromDataSet(JavaSparkContext sc, JavaRDD<DataSet> data) {

    return data.map(new Function<DataSet, LabeledPoint>() {
        @Override
        public LabeledPoint call(DataSet pt) {
            return toLabeledPoint(pt);
        }
    });
}
 
Example #15
Source File: MLLibUtil.java    From deeplearning4j with Apache License 2.0
/**
 * Convert an RDD of labeled points to an RDD of DataSet with continuous labels
 * @param data the java rdd of labeled points to convert
 * @return a JavaRDD<Dataset> with a continuous label
 * @deprecated Use {@link #fromContinuousLabeledPoint(JavaRDD)}
 */
@Deprecated
public static JavaRDD<DataSet> fromContinuousLabeledPoint(JavaSparkContext sc, JavaRDD<LabeledPoint> data) {

    return data.map(new Function<LabeledPoint, DataSet>() {
        @Override
        public DataSet call(LabeledPoint lp) {
            return convertToDataset(lp);
        }
    });
}
 
Example #16
Source File: MLLibUtil.java    From deeplearning4j with Apache License 2.0
/**
 * Convert a dataset (feature vector) to a labeled point
 * @param point the point to convert
 * @return the labeled point derived from this dataset
 */
private static LabeledPoint toLabeledPoint(DataSet point) {
    if (!point.getFeatures().isVector()) {
        throw new IllegalArgumentException("Feature matrix must be a vector");
    }

    Vector features = toVector(point.getFeatures().dup());

    // the label is the index of the largest element of the (one-hot) label vector
    double label = Nd4j.getBlasWrapper().iamax(point.getLabels());
    return new LabeledPoint(label, features);
}
 
Example #17
Source File: JavaNaiveBayesExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf sparkConf = new SparkConf().setAppName("JavaNaiveBayesExample");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  // $example on$
  String path = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> inputData = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();
  JavaRDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[]{0.6, 0.4});
  JavaRDD<LabeledPoint> training = tmp[0]; // training set
  JavaRDD<LabeledPoint> test = tmp[1]; // test set
  final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);
  JavaPairRDD<Double, Double> predictionAndLabel =
    test.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
      @Override
      public Tuple2<Double, Double> call(LabeledPoint p) {
        return new Tuple2<>(model.predict(p.features()), p.label());
      }
    });
  double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
    @Override
    public Boolean call(Tuple2<Double, Double> pl) {
      return pl._1().equals(pl._2());
    }
  }).count() / (double) test.count();
  System.out.println("Accuracy = " + accuracy);

  // Save and load model
  model.save(jsc.sc(), "target/tmp/myNaiveBayesModel");
  NaiveBayesModel sameModel = NaiveBayesModel.load(jsc.sc(), "target/tmp/myNaiveBayesModel");
  // $example off$

  jsc.stop();
}
 
Example #18
Source File: JavaLogisticRegressionWithLBFGSExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("JavaLogisticRegressionWithLBFGSExample");
  SparkContext sc = new SparkContext(conf);
  // $example on$
  String path = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();

  // Split initial RDD into two... [60% training data, 40% testing data].
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[] {0.6, 0.4}, 11L);
  JavaRDD<LabeledPoint> training = splits[0].cache();
  JavaRDD<LabeledPoint> test = splits[1];

  // Run training algorithm to build the model.
  final LogisticRegressionModel model = new LogisticRegressionWithLBFGS()
    .setNumClasses(10)
    .run(training.rdd());

  // Compute raw scores on the test set.
  JavaRDD<Tuple2<Object, Object>> predictionAndLabels = test.map(
    new Function<LabeledPoint, Tuple2<Object, Object>>() {
      public Tuple2<Object, Object> call(LabeledPoint p) {
        Double prediction = model.predict(p.features());
        return new Tuple2<Object, Object>(prediction, p.label());
      }
    }
  );

  // Get evaluation metrics.
  MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd());
  double accuracy = metrics.accuracy();
  System.out.println("Accuracy = " + accuracy);

  // Save and load model
  model.save(sc, "target/tmp/javaLogisticRegressionWithLBFGSModel");
  LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc,
    "target/tmp/javaLogisticRegressionWithLBFGSModel");
  // $example off$

  sc.stop();
}
 
Example #19
Source File: MLLibUtil.java    From deeplearning4j with Apache License 2.0
/**
 * Converts a JavaRDD of LabeledPoint to a JavaRDD of DataSet.
 * @param data JavaRDD of LabeledPoint
 * @param numPossibleLabels number of possible labels
 * @param preCache whether to pre-cache the rdd before the operation
 * @return a JavaRDD of DataSet
 */
public static JavaRDD<DataSet> fromLabeledPoint(JavaRDD<LabeledPoint> data, final long numPossibleLabels,
                boolean preCache) {
    if (preCache && !data.getStorageLevel().useMemory()) {
        data.cache();
    }
    return data.map(new Function<LabeledPoint, DataSet>() {
        @Override
        public DataSet call(LabeledPoint lp) {
            return fromLabeledPoint(lp, numPossibleLabels);
        }
    });
}
 
Example #20
Source File: MLLibUtil.java    From deeplearning4j with Apache License 2.0
/**
 * Convert an rdd of labeled points into an rdd of DataSet,
 * repartitioned according to the specified batch size
 * @param data the data to convert
 * @param numPossibleLabels the number of possible labels
 * @param batchSize the batch size
 * @return the new rdd
 */
public static JavaRDD<DataSet> fromLabeledPoint(JavaRDD<LabeledPoint> data, final long numPossibleLabels,
                long batchSize) {

    JavaRDD<DataSet> mappedData = data.map(new Function<LabeledPoint, DataSet>() {
        @Override
        public DataSet call(LabeledPoint lp) {
            return fromLabeledPoint(lp, numPossibleLabels);
        }
    });

    return mappedData.repartition((int) (mappedData.count() / batchSize));
}
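Note that the partition count is the integer division of the RDD's size by batchSize, so a batch size larger than the RDD would yield zero partitions; callers should pass a batch size no larger than (and ideally evenly dividing) the number of examples. A hypothetical usage, assuming an existing JavaRDD<LabeledPoint> named points holding 150 examples across 3 classes:

// 150 examples / batchSize 50 => 3 partitions, roughly one batch each.
JavaRDD<DataSet> batched = MLLibUtil.fromLabeledPoint(points, 3, 50);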
 
Example #21
Source File: RDFUpdate.java    From oryx with Apache License 2.0
/**
 * @param trainPointData data to run down trees
 * @param model random decision forest model to count on
 * @return maps of node IDs to the count of training examples that reached that node, one
 *  per tree in the model
 * @see #predictorExampleCounts(JavaRDD,RandomForestModel)
 */
private static List<IntLongHashMap> treeNodeExampleCounts(JavaRDD<? extends LabeledPoint> trainPointData,
                                                          RandomForestModel model) {
  return trainPointData.mapPartitions(data -> {
      DecisionTreeModel[] trees = model.trees();
      List<IntLongHashMap> treeNodeIDCounts = IntStream.range(0, trees.length).
          mapToObj(i -> new IntLongHashMap()).collect(Collectors.toList());
      data.forEachRemaining(datum -> {
        double[] featureVector = datum.features().toArray();
        for (int i = 0; i < trees.length; i++) {
          DecisionTreeModel tree = trees[i];
          IntLongHashMap nodeIDCount = treeNodeIDCounts.get(i);
          org.apache.spark.mllib.tree.model.Node node = tree.topNode();
          // This logic cloned from Node.predict:
          while (!node.isLeaf()) {
            // Count node ID
            nodeIDCount.addToValue(node.id(), 1);
            Split split = node.split().get();
            int featureIndex = split.feature();
            node = nextNode(featureVector, node, split, featureIndex);
          }
          nodeIDCount.addToValue(node.id(), 1);
        }
      });
      return Collections.singleton(treeNodeIDCounts).iterator();
    }
  ).reduce((a, b) -> {
      Preconditions.checkArgument(a.size() == b.size());
      for (int i = 0; i < a.size(); i++) {
        merge(a.get(i), b.get(i));
      }
      return a;
    });
}
 
Example #22
Source File: RDFUpdate.java    From oryx with Apache License 2.0
private JavaRDD<LabeledPoint> parseToLabeledPointRDD(
    JavaRDD<String[]> parsedRDD,
    CategoricalValueEncodings categoricalValueEncodings) {

  return parsedRDD.map(data -> {
    try {
      double[] features = new double[inputSchema.getNumPredictors()];
      double target = Double.NaN;
      for (int featureIndex = 0; featureIndex < data.length; featureIndex++) {
        double encoded;
        if (inputSchema.isNumeric(featureIndex)) {
          encoded = Double.parseDouble(data[featureIndex]);
        } else if (inputSchema.isCategorical(featureIndex)) {
          Map<String,Integer> valueEncoding =
              categoricalValueEncodings.getValueEncodingMap(featureIndex);
          encoded = valueEncoding.get(data[featureIndex]);
        } else {
          continue;
        }
        if (inputSchema.isTarget(featureIndex)) {
          target = encoded;
        } else {
          features[inputSchema.featureToPredictorIndex(featureIndex)] = encoded;
        }
      }
      Preconditions.checkState(!Double.isNaN(target));
      return new LabeledPoint(target, Vectors.dense(features));
    } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
      log.warn("Bad input: {}", Arrays.toString(data));
      throw e;
    }
  });
}
 
Example #23
Source File: LogisticRegressionBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testLogisticRegression() {
    //prepare data
    String datapath = "src/test/resources/binary_classification_test.libsvm";
    JavaRDD<LabeledPoint> trainingData = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();

    //Train model in spark
    LogisticRegressionModel lrmodel = new LogisticRegressionWithSGD().run(trainingData.rdd());

    //Export this model
    byte[] exportedModel = ModelExporter.export(lrmodel);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //validate predictions
    List<LabeledPoint> testPoints = trainingData.collect();
    for (LabeledPoint i : testPoints) {
        Vector v = i.features();
        double actual = lrmodel.predict(v);

        Map<String, Object> data = new HashMap<String, Object>();
        data.put("features", v.toArray());
        transformer.transform(data);
        double predicted = (double) data.get("prediction");

        assertEquals(actual, predicted, 0.01);
    }
}
 
Example #24
Source File: LogisticRegression1BridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testLogisticRegression() {
    //prepare data
    String datapath = "src/test/resources/binary_classification_test.libsvm";

    Dataset<Row> trainingData = spark.read().format("libsvm").load(datapath);

    //Train model in spark
    LogisticRegressionModel lrmodel = new LogisticRegression().fit(trainingData);

    //Export this model
    byte[] exportedModel = ModelExporter.export(lrmodel);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //validate predictions
    List<LabeledPoint> testPoints = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD().collect();
    for (LabeledPoint i : testPoints) {
        Vector v = i.features().asML();
        double actual = lrmodel.predict(v);

        Map<String, Object> data = new HashMap<String, Object>();
        data.put("features", v.toArray());
        transformer.transform(data);
        double predicted = (double) data.get("prediction");

        assertEquals(actual, predicted, 0.01);
    }
}
 
Example #25
Source File: MinMaxScalerBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testMinMaxScaler() {
    //prepare data
    List<LabeledPoint> localTraining = Arrays.asList(
            new LabeledPoint(1.0, Vectors.dense(data[0])),
            new LabeledPoint(2.0, Vectors.dense(data[1])),
            new LabeledPoint(3.0, Vectors.dense(data[2])),
            new LabeledPoint(3.0, Vectors.dense(data[3])));
    DataFrame df = sqlContext.createDataFrame(sc.parallelize(localTraining), LabeledPoint.class);

    //train model in spark
    MinMaxScalerModel sparkModel = new MinMaxScaler()
            .setInputCol("features")
            .setOutputCol("scaled")
            .setMin(-5)
            .setMax(5)
            .fit(df);


    //Export model, import it back and get transformer
    byte[] exportedModel = ModelExporter.export(sparkModel, df);
    final Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //compare predictions
    Row[] sparkOutput = sparkModel.transform(df).orderBy("label").select("features", "scaled").collect();
    assertCorrectness(sparkOutput, expected, transformer);
}
 
Example #26
Source File: Log1PScalerBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testCustomScalerDenseVector() {
    final double[][] precomputedAns = new double[3][3];
    //precompute answers
    for (int j = 0; j < 3; j++)
        for (int k = 0; k < 3; k++)
            precomputedAns[j][k] = Math.log1p(data[j][k]);

    //prepare data
    List<LabeledPoint> localTraining = Arrays.asList(
            new LabeledPoint(1.0, Vectors.dense(data[0])),
            new LabeledPoint(2.0, Vectors.dense(data[1])),
            new LabeledPoint(3.0, Vectors.dense(data[2])));
    DataFrame df = sqlContext.createDataFrame(sc.parallelize(localTraining), LabeledPoint.class);

    for (int i = 0; i < 2; i++) {
        //train model in spark
        Log1PScaler sparkModel = new Log1PScaler()
                .setInputCol("features")
                .setOutputCol("scaledOutput");

        //Export model, import it back and get transformer
        byte[] exportedModel = ModelExporter.export(sparkModel, df);
        final Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

        //compare predictions
        Row[] sparkOutput = sparkModel.transform(df).orderBy("label").select("features", "scaledOutput").collect();
        assertCorrectness(sparkOutput, precomputedAns, transformer);
    }
}
 
Example #27
Source File: LogisticRegressionBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testLogisticRegression() {
    //prepare data
    String datapath = "src/test/resources/binary_classification_test.libsvm";
    JavaRDD<LabeledPoint> trainingData = MLUtils.loadLibSVMFile(sc.sc(), datapath).toJavaRDD();

    //Train model in spark
    LogisticRegressionModel lrmodel = new LogisticRegressionWithSGD().run(trainingData.rdd());

    //Export this model
    byte[] exportedModel = ModelExporter.export(lrmodel, null);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //validate predictions
    List<LabeledPoint> testPoints = trainingData.collect();
    for (LabeledPoint i : testPoints) {
        Vector v = i.features();
        double actual = lrmodel.predict(v);

        Map<String, Object> data = new HashMap<String, Object>();
        data.put("features", v.toArray());
        transformer.transform(data);
        double predicted = (double) data.get("prediction");

        assertEquals(actual, predicted, EPSILON);
    }
}
 
Example #28
Source File: LogisticRegression1BridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testLogisticRegression() {
    //prepare data
    String datapath = "src/test/resources/binary_classification_test.libsvm";

    DataFrame trainingData = sqlContext.read().format("libsvm").load(datapath);

    //Train model in spark
    LogisticRegressionModel lrmodel = new LogisticRegression().fit(trainingData);

    //Export this model
    byte[] exportedModel = ModelExporter.export(lrmodel, trainingData);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //validate predictions
    List<LabeledPoint> testPoints = MLUtils.loadLibSVMFile(sc.sc(), datapath).toJavaRDD().collect();
    for (LabeledPoint i : testPoints) {
        Vector v = i.features();
        double actual = lrmodel.predict(v);

        Map<String, Object> data = new HashMap<String, Object>();
        data.put("features", v.toArray());
        transformer.transform(data);
        double predicted = (double) data.get("prediction");

        assertEquals(actual, predicted, EPSILON);
    }
}
 
Example #29
Source File: MLSupporter.java    From DDF with Apache License 2.0
/**
 * Override this to return the appropriate DDF representation matching that specified in {@link ParamInfo}. The base
 * implementation simply returns the DDF.
 *
 * @param paramInfo
 * @return
 */
@SuppressWarnings("unchecked")
@Override
protected Object convertDDF(ParamInfo paramInfo) throws DDFException {
  mLog.info(">>>> Running ConvertDDF of io.ddf.spark.ml.MLSupporter");
  if (paramInfo.argMatches(RDD.class)) {
    // Yay, our target data format is an RDD!
    RDD<?> rdd = null;

    if (paramInfo.paramMatches(LabeledPoint.class)) {
      rdd = (RDD<LabeledPoint>) this.getDDF().getRepresentationHandler().get(RDD.class, LabeledPoint.class);

    } else if (paramInfo.paramMatches(Vector.class)) {
      rdd = (RDD<Vector>) this.getDDF().getRepresentationHandler().get(RDD.class, Vector.class);
    } else if (paramInfo.paramMatches(double[].class)) {
      rdd = (RDD<double[]>) this.getDDF().getRepresentationHandler().get(RDD.class, double[].class);
    } else if (paramInfo.paramMatches(io.ddf.types.Vector.class)) {
      rdd = (RDD<io.ddf.types.Vector>) this.getDDF().getRepresentationHandler()
          .get(RDD.class, io.ddf.types.Vector.class);
    } else if (paramInfo.paramMatches(TupleMatrixVector.class)) {
      rdd = (RDD<TupleMatrixVector>) this.getDDF().getRepresentationHandler().get(RDD.class, TupleMatrixVector.class);
    } else if (paramInfo.paramMatches(Rating.class)) {
      rdd = (RDD<Rating>) this.getDDF().getRepresentationHandler().get(RDD.class, Rating.class);
    }
    //      else if (paramInfo.paramMatches(TablePartition.class)) {
    //        rdd = (RDD<TablePartition>) this.getDDF().getRepresentationHandler().get(RDD.class, TablePartition.class);
    //      }
    else if (paramInfo.paramMatches(Object.class)) {
      rdd = (RDD<Object[]>) this.getDDF().getRepresentationHandler().get(RDD.class, Object[].class);
    }

    return rdd;
  } else {
    return super.convertDDF(paramInfo);
  }
}
 
Example #30
Source File: MLMetricsSupporter.java    From DDF with Apache License 2.0
@Override
/*
 * input expected RDD[double[][]]
 * (non-Javadoc)
 * @see io.ddf.ml.AMLMetricsSupporter#roc(io.ddf.DDF, int)
 */
public RocMetric roc(DDF predictionDDF, int alpha_length) throws DDFException {

  RDD<LabeledPoint> rddLabeledPoint = (RDD<LabeledPoint>) predictionDDF.getRepresentationHandler()
      .get(RDD.class, LabeledPoint.class);
  ROCComputer rc = new ROCComputer();

  return (rc.ROC(rddLabeledPoint, alpha_length));
}