org.apache.spark.mllib.util.MLUtils Java Examples

The following examples show how to use org.apache.spark.mllib.util.MLUtils. Each example is taken from an open-source project; its source file and license are noted in the header above the code.
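As a quick orientation before the project examples, here is a minimal, self-contained sketch of the most common call, MLUtils.loadLibSVMFile, which parses a LIBSVM-format text file into an RDD of LabeledPoint. The class name and data path below are placeholders and not taken from any of the projects listed.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.util.MLUtils;

public class MLUtilsQuickStart {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("MLUtilsQuickStart");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // Each line of a LIBSVM file is "label index1:value1 index2:value2 ...";
    // loadLibSVMFile turns it into an RDD of LabeledPoint with one sparse feature vector per line.
    String path = "data/mllib/sample_libsvm_data.txt"; // placeholder path
    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();

    System.out.println("Loaded " + data.count() + " labeled points with "
        + data.first().features().size() + " features");

    jsc.stop();
  }
}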
Example #1
Source File: LogisticRegressionExporterTest.java    From spark-transformers with Apache License 2.0
@Test
public void shouldExportAndImportCorrectly() {
    String datapath = "src/test/resources/binary_classification_test.libsvm";
    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc.sc(), datapath).toJavaRDD();

    //Train model in spark
    LogisticRegressionModel lrmodel = new LogisticRegressionWithSGD().run(data.rdd());

    //Export this model
    byte[] exportedModel = ModelExporter.export(lrmodel, null);

    //Import it back
    LogisticRegressionModelInfo importedModel = (LogisticRegressionModelInfo) ModelImporter.importModelInfo(exportedModel);

    //check that the two models are equal with respect to their fields
    //(there may be edge cases, e.g. the order of elements in a list could change)
    assertEquals(lrmodel.intercept(), importedModel.getIntercept(), EPSILON);
    assertEquals(lrmodel.numClasses(), importedModel.getNumClasses(), EPSILON);
    assertEquals(lrmodel.numFeatures(), importedModel.getNumFeatures(), EPSILON);
    assertEquals((double) lrmodel.getThreshold().get(), importedModel.getThreshold(), EPSILON);
    for (int i = 0; i < importedModel.getNumFeatures(); i++)
        assertEquals(lrmodel.weights().toArray()[i], importedModel.getWeights()[i], EPSILON);

}
 
Example #2
Source File: LogisticRegressionExporterTest.java    From spark-transformers with Apache License 2.0
@Test
public void shouldExportAndImportCorrectly() {
    String datapath = "src/test/resources/binary_classification_test.libsvm";
    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();

    //Train model in spark
    LogisticRegressionModel lrmodel = new LogisticRegressionWithSGD().run(data.rdd());

    //Export this model
    byte[] exportedModel = ModelExporter.export(lrmodel);

    //Import it back
    LogisticRegressionModelInfo importedModel = (LogisticRegressionModelInfo) ModelImporter.importModelInfo(exportedModel);

    //check that the two models are equal with respect to their fields
    //(there may be edge cases, e.g. the order of elements in a list could change)
    assertEquals(lrmodel.intercept(), importedModel.getIntercept(), 0.01);
    assertEquals(lrmodel.numClasses(), importedModel.getNumClasses(), 0.01);
    assertEquals(lrmodel.numFeatures(), importedModel.getNumFeatures(), 0.01);
    assertEquals((double) lrmodel.getThreshold().get(), importedModel.getThreshold(), 0.01);
    for (int i = 0; i < importedModel.getNumFeatures(); i++)
        assertEquals(lrmodel.weights().toArray()[i], importedModel.getWeights()[i], 0.01);

}
 
Example #3
Source File: TestSparkMultiLayerParameterAveraging.java    From deeplearning4j with Apache License 2.0
@Test
public void testFromSvmLight() throws Exception {
    JavaRDD<LabeledPoint> data = MLUtils
                    .loadLibSVMFile(sc.sc(),
                                    new ClassPathResource("svmLight/iris_svmLight_0.txt").getTempFileFromArchive()
                                                    .getAbsolutePath())
                    .toJavaRDD().map(new TestFn());

    MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
                    .seed(123)
                    .updater(new Adam(1e-6))
                    .weightInit(WeightInit.XAVIER)
                    .list()
                    .layer(new BatchNormalization.Builder().nIn(4).nOut(4).build())
                    .layer(new DenseLayer.Builder().nIn(4).nOut(32).activation(Activation.RELU).build())
                    .layer(new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                                    .nIn(32).nOut(3).activation(Activation.SOFTMAX).build())
                    .build();



    MultiLayerNetwork network = new MultiLayerNetwork(conf);
    network.init();
    System.out.println("Initializing network");
    SparkDl4jMultiLayer master = new SparkDl4jMultiLayer(sc, getBasicConf(),
                    new ParameterAveragingTrainingMaster(true, numExecutors(), 1, 5, 1, 0));

    master.fitLabeledPoint(data);
}
 
Example #4
Source File: TestSparkMultiLayerParameterAveraging.java    From deeplearning4j with Apache License 2.0
@Test
public void testFromSvmLightBackprop() throws Exception {
    JavaRDD<LabeledPoint> data = MLUtils
                    .loadLibSVMFile(sc.sc(),
                                    new ClassPathResource("svmLight/iris_svmLight_0.txt").getTempFileFromArchive()
                                                    .getAbsolutePath())
                    .toJavaRDD().map(new TestFn());

    DataSet d = new IrisDataSetIterator(150, 150).next();
    MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().seed(123)
                    .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT).list()
                    .layer(0, new DenseLayer.Builder().nIn(4).nOut(100).weightInit(WeightInit.XAVIER)
                                    .activation(Activation.RELU).build())
                    .layer(1, new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(
                                    LossFunctions.LossFunction.MCXENT).nIn(100).nOut(3)
                                                    .activation(Activation.SOFTMAX).weightInit(WeightInit.XAVIER)
                                                    .build())
                    .build();



    MultiLayerNetwork network = new MultiLayerNetwork(conf);
    network.init();
    System.out.println("Initializing network");

    SparkDl4jMultiLayer master = new SparkDl4jMultiLayer(sc, conf,
                    new ParameterAveragingTrainingMaster(true, numExecutors(), 1, 5, 1, 0));

    MultiLayerNetwork network2 = master.fitLabeledPoint(data);
}
 
Example #5
Source File: JavaNaiveBayesExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf sparkConf = new SparkConf().setAppName("JavaNaiveBayesExample");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  // $example on$
  String path = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> inputData = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();
  JavaRDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[]{0.6, 0.4});
  JavaRDD<LabeledPoint> training = tmp[0]; // training set
  JavaRDD<LabeledPoint> test = tmp[1]; // test set
  final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);
  JavaPairRDD<Double, Double> predictionAndLabel =
    test.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
      @Override
      public Tuple2<Double, Double> call(LabeledPoint p) {
        return new Tuple2<>(model.predict(p.features()), p.label());
      }
    });
  double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
    @Override
    public Boolean call(Tuple2<Double, Double> pl) {
      return pl._1().equals(pl._2());
    }
  }).count() / (double) test.count();
  System.out.println("Accuracy: " + accuracy);

  // Save and load model
  model.save(jsc.sc(), "target/tmp/myNaiveBayesModel");
  NaiveBayesModel sameModel = NaiveBayesModel.load(jsc.sc(), "target/tmp/myNaiveBayesModel");
  // $example off$

  jsc.stop();
}
 
Example #6
Source File: LogisticRegression1BridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testLogisticRegression() {
    //prepare data
    String datapath = "src/test/resources/binary_classification_test.libsvm";

    DataFrame trainingData = sqlContext.read().format("libsvm").load(datapath);

    //Train model in spark
    LogisticRegressionModel lrmodel = new LogisticRegression().fit(trainingData);

    //Export this model
    byte[] exportedModel = ModelExporter.export(lrmodel, trainingData);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //validate predictions
    List<LabeledPoint> testPoints = MLUtils.loadLibSVMFile(sc.sc(), datapath).toJavaRDD().collect();
    for (LabeledPoint i : testPoints) {
        Vector v = i.features();
        double actual = lrmodel.predict(v);

        Map<String, Object> data = new HashMap<String, Object>();
        data.put("features", v.toArray());
        transformer.transform(data);
        double predicted = (double) data.get("prediction");

        assertEquals(actual, predicted, EPSILON);
    }
}
 
Example #7
Source File: LogisticRegressionBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testLogisticRegression() {
    //prepare data
    String datapath = "src/test/resources/binary_classification_test.libsvm";
    JavaRDD<LabeledPoint> trainingData = MLUtils.loadLibSVMFile(sc.sc(), datapath).toJavaRDD();

    //Train model in spark
    LogisticRegressionModel lrmodel = new LogisticRegressionWithSGD().run(trainingData.rdd());

    //Export this model
    byte[] exportedModel = ModelExporter.export(lrmodel, null);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //validate predictions
    List<LabeledPoint> testPoints = trainingData.collect();
    for (LabeledPoint i : testPoints) {
        Vector v = i.features();
        double actual = lrmodel.predict(v);

        Map<String, Object> data = new HashMap<String, Object>();
        data.put("features", v.toArray());
        transformer.transform(data);
        double predicted = (double) data.get("prediction");

        assertEquals(actual, predicted, EPSILON);
    }
}
 
Example #8
Source File: JavaLogisticRegressionWithLBFGSExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("JavaLogisticRegressionWithLBFGSExample");
  SparkContext sc = new SparkContext(conf);
  // $example on$
  String path = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();

  // Split initial RDD into two... [60% training data, 40% testing data].
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[] {0.6, 0.4}, 11L);
  JavaRDD<LabeledPoint> training = splits[0].cache();
  JavaRDD<LabeledPoint> test = splits[1];

  // Run training algorithm to build the model.
  final LogisticRegressionModel model = new LogisticRegressionWithLBFGS()
    .setNumClasses(10)
    .run(training.rdd());

  // Compute raw scores on the test set.
  JavaRDD<Tuple2<Object, Object>> predictionAndLabels = test.map(
    new Function<LabeledPoint, Tuple2<Object, Object>>() {
      public Tuple2<Object, Object> call(LabeledPoint p) {
        Double prediction = model.predict(p.features());
        return new Tuple2<Object, Object>(prediction, p.label());
      }
    }
  );

  // Get evaluation metrics.
  MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd());
  double accuracy = metrics.accuracy();
  System.out.println("Accuracy = " + accuracy);

  // Save and load model
  model.save(sc, "target/tmp/javaLogisticRegressionWithLBFGSModel");
  LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc,
    "target/tmp/javaLogisticRegressionWithLBFGSModel");
  // $example off$

  sc.stop();
}
 
Example #9
Source File: LogisticRegression1BridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testLogisticRegression() {
    //prepare data
    String datapath = "src/test/resources/binary_classification_test.libsvm";

    Dataset<Row> trainingData = spark.read().format("libsvm").load(datapath);

    //Train model in spark
    LogisticRegressionModel lrmodel = new LogisticRegression().fit(trainingData);

    //Export this model
    byte[] exportedModel = ModelExporter.export(lrmodel);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //validate predictions
    List<LabeledPoint> testPoints = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD().collect();
    for (LabeledPoint i : testPoints) {
        Vector v = i.features().asML();
        double actual = lrmodel.predict(v);

        Map<String, Object> data = new HashMap<String, Object>();
        data.put("features", v.toArray());
        transformer.transform(data);
        double predicted = (double) data.get("prediction");

        assertEquals(actual, predicted, 0.01);
    }
}
 
Example #10
Source File: LogisticRegressionBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testLogisticRegression() {
    //prepare data
    String datapath = "src/test/resources/binary_classification_test.libsvm";
    JavaRDD<LabeledPoint> trainingData = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();

    //Train model in spark
    LogisticRegressionModel lrmodel = new LogisticRegressionWithSGD().run(trainingData.rdd());

    //Export this model
    byte[] exportedModel = ModelExporter.export(lrmodel);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //validate predictions
    List<LabeledPoint> testPoints = trainingData.collect();
    for (LabeledPoint i : testPoints) {
        Vector v = i.features();
        double actual = lrmodel.predict(v);

        Map<String, Object> data = new HashMap<String, Object>();
        data.put("features", v.toArray());
        transformer.transform(data);
        double predicted = (double) data.get("prediction");

        assertEquals(actual, predicted, 0.01);
    }
}
 
Example #11
Source File: JavaDecisionTreeRegressionExample.java    From SparkDemo with MIT License
public static void main(String[] args) {

    // $example on$
    SparkConf sparkConf = new SparkConf().setAppName("JavaDecisionTreeRegressionExample");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);

    // Load and parse the data file.
    String datapath = "data/mllib/sample_libsvm_data.txt";
    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
    // Split the data into training and test sets (30% held out for testing)
    JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
    JavaRDD<LabeledPoint> trainingData = splits[0];
    JavaRDD<LabeledPoint> testData = splits[1];

    // Set parameters.
    // Empty categoricalFeaturesInfo indicates all features are continuous.
    Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
    String impurity = "variance";
    Integer maxDepth = 5;
    Integer maxBins = 32;

    // Train a DecisionTree model.
    final DecisionTreeModel model = DecisionTree.trainRegressor(trainingData,
      categoricalFeaturesInfo, impurity, maxDepth, maxBins);

    // Evaluate model on test instances and compute test error
    JavaPairRDD<Double, Double> predictionAndLabel =
      testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
      @Override
      public Tuple2<Double, Double> call(LabeledPoint p) {
        return new Tuple2<>(model.predict(p.features()), p.label());
      }
    });
    Double testMSE =
      predictionAndLabel.map(new Function<Tuple2<Double, Double>, Double>() {
        @Override
        public Double call(Tuple2<Double, Double> pl) {
          Double diff = pl._1() - pl._2();
          return diff * diff;
        }
      }).reduce(new Function2<Double, Double, Double>() {
        @Override
        public Double call(Double a, Double b) {
          return a + b;
        }
      }) / testData.count();
    System.out.println("Test Mean Squared Error: " + testMSE);
    System.out.println("Learned regression tree model:\n" + model.toDebugString());

    // Save and load model
    model.save(jsc.sc(), "target/tmp/myDecisionTreeRegressionModel");
    DecisionTreeModel sameModel = DecisionTreeModel
      .load(jsc.sc(), "target/tmp/myDecisionTreeRegressionModel");
    // $example off$
  }
 
Example #12
Source File: RDDConverterUtils.java    From systemds with Apache License 2.0
/**
 * Converts a libsvm text input file into two binary block matrices for features 
 * and labels, and saves these to the specified output files. This call also deletes 
 * existing files at the specified output locations, as well as determines and 
 * writes the meta data files of both output matrices. 
 * <p>
 * Note: We use {@code org.apache.spark.mllib.util.MLUtils.loadLibSVMFile} for parsing 
 * the libsvm input files in order to ensure consistency with Spark.
 * 
 * @param sc java spark context
 * @param pathIn path to libsvm input file
 * @param pathX path to binary block output file of features
 * @param pathY path to binary block output file of labels
 * @param mcOutX matrix characteristics of output matrix X
 */
public static void libsvmToBinaryBlock(JavaSparkContext sc, String pathIn, 
		String pathX, String pathY, DataCharacteristics mcOutX)
{
	if( !mcOutX.dimsKnown() )
		throw new DMLRuntimeException("Matrix characteristics "
			+ "required to convert sparse input representation.");
	try {
		//cleanup existing output files
		HDFSTool.deleteFileIfExistOnHDFS(pathX);
		HDFSTool.deleteFileIfExistOnHDFS(pathY);
		
		//convert libsvm to labeled points
		int numFeatures = (int) mcOutX.getCols();
		int numPartitions = SparkUtils.getNumPreferredPartitions(mcOutX, null);
		JavaRDD<org.apache.spark.mllib.regression.LabeledPoint> lpoints = 
				MLUtils.loadLibSVMFile(sc.sc(), pathIn, numFeatures, numPartitions).toJavaRDD();
		
		//append row index and best-effort caching to avoid repeated text parsing
		JavaPairRDD<org.apache.spark.mllib.regression.LabeledPoint,Long> ilpoints = 
				lpoints.zipWithIndex().persist(StorageLevel.MEMORY_AND_DISK()); 
		
		//extract labels and convert to binary block
		DataCharacteristics mc1 = new MatrixCharacteristics(mcOutX.getRows(), 1, mcOutX.getBlocksize(), -1);
		LongAccumulator aNnz1 = sc.sc().longAccumulator("nnz");
		JavaPairRDD<MatrixIndexes,MatrixBlock> out1 = ilpoints
				.mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc1, true, aNnz1));
		int numPartitions2 = SparkUtils.getNumPreferredPartitions(mc1, null);
		out1 = RDDAggregateUtils.mergeByKey(out1, numPartitions2, false);
		out1.saveAsHadoopFile(pathY, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
		mc1.setNonZeros(aNnz1.value()); //update nnz after triggered save
		HDFSTool.writeMetaDataFile(pathY+".mtd", ValueType.FP64, mc1, FileFormat.BINARY);
		
		//extract data and convert to binary block
		DataCharacteristics mc2 = new MatrixCharacteristics(mcOutX.getRows(), mcOutX.getCols(), mcOutX.getBlocksize(), -1);
		LongAccumulator aNnz2 = sc.sc().longAccumulator("nnz");
		JavaPairRDD<MatrixIndexes,MatrixBlock> out2 = ilpoints
				.mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc2, false, aNnz2));
		out2 = RDDAggregateUtils.mergeByKey(out2, numPartitions, false);
		out2.saveAsHadoopFile(pathX, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
		mc2.setNonZeros(aNnz2.value()); //update nnz after triggered save
		HDFSTool.writeMetaDataFile(pathX+".mtd", ValueType.FP64, mc2, FileFormat.BINARY);
		
		//asynchronous cleanup of cached intermediates
		ilpoints.unpersist(false);
	}
	catch(IOException ex) {
		throw new DMLRuntimeException(ex);
	}
}
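A hedged usage sketch for the converter above; the paths, dimensions, and block size are placeholders, and it assumes the same SystemDS classes referenced in the method body (DataCharacteristics, MatrixCharacteristics, RDDConverterUtils) are on the classpath.

// Hypothetical driver code; adjust paths and matrix dimensions for your data.
JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("LibsvmToBinaryBlock"));

// The output characteristics of the feature matrix X must be known up front,
// e.g. 10,000 rows, 784 columns, block size 1,000, unknown non-zeros (-1).
DataCharacteristics mcOutX = new MatrixCharacteristics(10000, 784, 1000, -1);

RDDConverterUtils.libsvmToBinaryBlock(sc,
		"hdfs:/tmp/input.libsvm",  // libsvm text input
		"hdfs:/tmp/X.binary",      // binary block features
		"hdfs:/tmp/Y.binary",      // binary block labels
		mcOutX);

sc.stop();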
 
Example #13
Source File: JavaBinaryClassificationMetricsExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("Java Binary Classification Metrics Example");
  SparkContext sc = new SparkContext(conf);
  // $example on$
  String path = "data/mllib/sample_binary_classification_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();

  // Split initial RDD into two... [60% training data, 40% testing data].
  JavaRDD<LabeledPoint>[] splits =
    data.randomSplit(new double[]{0.6, 0.4}, 11L);
  JavaRDD<LabeledPoint> training = splits[0].cache();
  JavaRDD<LabeledPoint> test = splits[1];

  // Run training algorithm to build the model.
  final LogisticRegressionModel model = new LogisticRegressionWithLBFGS()
    .setNumClasses(2)
    .run(training.rdd());

  // Clear the prediction threshold so the model will return probabilities
  model.clearThreshold();

  // Compute raw scores on the test set.
  JavaRDD<Tuple2<Object, Object>> predictionAndLabels = test.map(
    new Function<LabeledPoint, Tuple2<Object, Object>>() {
      @Override
      public Tuple2<Object, Object> call(LabeledPoint p) {
        Double prediction = model.predict(p.features());
        return new Tuple2<Object, Object>(prediction, p.label());
      }
    }
  );

  // Get evaluation metrics.
  BinaryClassificationMetrics metrics =
    new BinaryClassificationMetrics(predictionAndLabels.rdd());

  // Precision by threshold
  JavaRDD<Tuple2<Object, Object>> precision = metrics.precisionByThreshold().toJavaRDD();
  System.out.println("Precision by threshold: " + precision.collect());

  // Recall by threshold
  JavaRDD<Tuple2<Object, Object>> recall = metrics.recallByThreshold().toJavaRDD();
  System.out.println("Recall by threshold: " + recall.collect());

  // F Score by threshold
  JavaRDD<Tuple2<Object, Object>> f1Score = metrics.fMeasureByThreshold().toJavaRDD();
  System.out.println("F1 Score by threshold: " + f1Score.collect());

  JavaRDD<Tuple2<Object, Object>> f2Score = metrics.fMeasureByThreshold(2.0).toJavaRDD();
  System.out.println("F2 Score by threshold: " + f2Score.collect());

  // Precision-recall curve
  JavaRDD<Tuple2<Object, Object>> prc = metrics.pr().toJavaRDD();
  System.out.println("Precision-recall curve: " + prc.collect());

  // Thresholds
  JavaRDD<Double> thresholds = precision.map(
    new Function<Tuple2<Object, Object>, Double>() {
      @Override
      public Double call(Tuple2<Object, Object> t) {
        return new Double(t._1().toString());
      }
    }
  );

  // ROC Curve
  JavaRDD<Tuple2<Object, Object>> roc = metrics.roc().toJavaRDD();
  System.out.println("ROC curve: " + roc.collect());

  // AUPRC
  System.out.println("Area under precision-recall curve = " + metrics.areaUnderPR());

  // AUROC
  System.out.println("Area under ROC = " + metrics.areaUnderROC());

  // Save and load model
  model.save(sc, "target/tmp/LogisticRegressionModel");
  LogisticRegressionModel.load(sc, "target/tmp/LogisticRegressionModel");
  // $example off$
}
 
Example #14
Source File: RDDConverterUtils.java    From systemds with Apache License 2.0
/**
 * Converts a libsvm text input file into two binary block matrices for features 
 * and labels, and saves these to the specified output files. This call also deletes 
 * existing files at the specified output locations, as well as determines and 
 * writes the meta data files of both output matrices. 
 * <p>
 * Note: We use {@code org.apache.spark.mllib.util.MLUtils.loadLibSVMFile} for parsing 
 * the libsvm input files in order to ensure consistency with Spark.
 * 
 * @param sc java spark context
 * @param pathIn path to libsvm input file
 * @param pathX path to binary block output file of features
 * @param pathY path to binary block output file of labels
 * @param mcOutX matrix characteristics of output matrix X
 */
public static void libsvmToBinaryBlock(JavaSparkContext sc, String pathIn, 
		String pathX, String pathY, DataCharacteristics mcOutX)
{
	if( !mcOutX.dimsKnown() )
		throw new DMLRuntimeException("Matrix characteristics "
			+ "required to convert sparse input representation.");
	try {
		//cleanup existing output files
		HDFSTool.deleteFileIfExistOnHDFS(pathX);
		HDFSTool.deleteFileIfExistOnHDFS(pathY);
		
		//convert libsvm to labeled points
		int numFeatures = (int) mcOutX.getCols();
		int numPartitions = SparkUtils.getNumPreferredPartitions(mcOutX, null);
		JavaRDD<org.apache.spark.mllib.regression.LabeledPoint> lpoints = 
				MLUtils.loadLibSVMFile(sc.sc(), pathIn, numFeatures, numPartitions).toJavaRDD();
		
		//append row index and best-effort caching to avoid repeated text parsing
		JavaPairRDD<org.apache.spark.mllib.regression.LabeledPoint,Long> ilpoints = 
				lpoints.zipWithIndex().persist(StorageLevel.MEMORY_AND_DISK()); 
		
		//extract labels and convert to binary block
		DataCharacteristics mc1 = new MatrixCharacteristics(mcOutX.getRows(), 1, mcOutX.getBlocksize(), -1);
		LongAccumulator aNnz1 = sc.sc().longAccumulator("nnz");
		JavaPairRDD<MatrixIndexes,MatrixBlock> out1 = ilpoints
				.mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc1, true, aNnz1));
		int numPartitions2 = SparkUtils.getNumPreferredPartitions(mc1, null);
		out1 = RDDAggregateUtils.mergeByKey(out1, numPartitions2, false);
		out1.saveAsHadoopFile(pathY, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
		mc1.setNonZeros(aNnz1.value()); //update nnz after triggered save
		HDFSTool.writeMetaDataFile(pathY+".mtd", ValueType.FP64, mc1, OutputInfo.BinaryBlockOutputInfo);
		
		//extract data and convert to binary block
		DataCharacteristics mc2 = new MatrixCharacteristics(mcOutX.getRows(), mcOutX.getCols(), mcOutX.getBlocksize(), -1);
		LongAccumulator aNnz2 = sc.sc().longAccumulator("nnz");
		JavaPairRDD<MatrixIndexes,MatrixBlock> out2 = ilpoints
				.mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc2, false, aNnz2));
		out2 = RDDAggregateUtils.mergeByKey(out2, numPartitions, false);
		out2.saveAsHadoopFile(pathX, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
		mc2.setNonZeros(aNnz2.value()); //update nnz after triggered save
		HDFSTool.writeMetaDataFile(pathX+".mtd", ValueType.FP64, mc2, OutputInfo.BinaryBlockOutputInfo);
		
		//asynchronous cleanup of cached intermediates
		ilpoints.unpersist(false);
	}
	catch(IOException ex) {
		throw new DMLRuntimeException(ex);
	}
}
 
Example #15
Source File: JavaDecisionTreeClassificationExample.java    From SparkDemo with MIT License
public static void main(String[] args) {

    // $example on$
    SparkConf sparkConf = new SparkConf().setAppName("JavaDecisionTreeClassificationExample");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);

    // Load and parse the data file.
    String datapath = "data/mllib/sample_libsvm_data.txt";
    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
    // Split the data into training and test sets (30% held out for testing)
    JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
    JavaRDD<LabeledPoint> trainingData = splits[0];
    JavaRDD<LabeledPoint> testData = splits[1];

    // Set parameters.
    //  Empty categoricalFeaturesInfo indicates all features are continuous.
    Integer numClasses = 2;
    Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
    String impurity = "gini";
    Integer maxDepth = 5;
    Integer maxBins = 32;

    // Train a DecisionTree model for classification.
    final DecisionTreeModel model = DecisionTree.trainClassifier(trainingData, numClasses,
      categoricalFeaturesInfo, impurity, maxDepth, maxBins);

    // Evaluate model on test instances and compute test error
    JavaPairRDD<Double, Double> predictionAndLabel =
      testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
        @Override
        public Tuple2<Double, Double> call(LabeledPoint p) {
          return new Tuple2<>(model.predict(p.features()), p.label());
        }
      });
    Double testErr =
      1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
        @Override
        public Boolean call(Tuple2<Double, Double> pl) {
          return !pl._1().equals(pl._2());
        }
      }).count() / testData.count();

    System.out.println("Test Error: " + testErr);
    System.out.println("Learned classification tree model:\n" + model.toDebugString());

    // Save and load model
    model.save(jsc.sc(), "target/tmp/myDecisionTreeClassificationModel");
    DecisionTreeModel sameModel = DecisionTreeModel
      .load(jsc.sc(), "target/tmp/myDecisionTreeClassificationModel");
    // $example off$
  }
 
Example #16
Source File: JavaGradientBoostingRegressionExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  // $example on$
  SparkConf sparkConf = new SparkConf()
    .setAppName("JavaGradientBoostedTreesRegressionExample");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  // Load and parse the data file.
  String datapath = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
  // Split the data into training and test sets (30% held out for testing)
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
  JavaRDD<LabeledPoint> trainingData = splits[0];
  JavaRDD<LabeledPoint> testData = splits[1];

  // Train a GradientBoostedTrees model.
  // The defaultParams for Regression use SquaredError by default.
  BoostingStrategy boostingStrategy = BoostingStrategy.defaultParams("Regression");
  boostingStrategy.setNumIterations(3); // Note: Use more iterations in practice.
  boostingStrategy.getTreeStrategy().setMaxDepth(5);
  // Empty categoricalFeaturesInfo indicates all features are continuous.
  Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
  boostingStrategy.treeStrategy().setCategoricalFeaturesInfo(categoricalFeaturesInfo);

  final GradientBoostedTreesModel model =
    GradientBoostedTrees.train(trainingData, boostingStrategy);

  // Evaluate model on test instances and compute test error
  JavaPairRDD<Double, Double> predictionAndLabel =
    testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
      @Override
      public Tuple2<Double, Double> call(LabeledPoint p) {
        return new Tuple2<>(model.predict(p.features()), p.label());
      }
    });
  Double testMSE =
    predictionAndLabel.map(new Function<Tuple2<Double, Double>, Double>() {
      @Override
      public Double call(Tuple2<Double, Double> pl) {
        Double diff = pl._1() - pl._2();
        return diff * diff;
      }
    }).reduce(new Function2<Double, Double, Double>() {
      @Override
      public Double call(Double a, Double b) {
        return a + b;
      }
    }) / testData.count();
  System.out.println("Test Mean Squared Error: " + testMSE);
  System.out.println("Learned regression GBT model:\n" + model.toDebugString());

  // Save and load model
  model.save(jsc.sc(), "target/tmp/myGradientBoostingRegressionModel");
  GradientBoostedTreesModel sameModel = GradientBoostedTreesModel.load(jsc.sc(),
    "target/tmp/myGradientBoostingRegressionModel");
  // $example off$

  jsc.stop();
}
 
Example #17
Source File: JavaSVMWithSGDExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("JavaSVMWithSGDExample");
  SparkContext sc = new SparkContext(conf);
  // $example on$
  String path = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();

  // Split initial RDD into two... [60% training data, 40% testing data].
  JavaRDD<LabeledPoint> training = data.sample(false, 0.6, 11L);
  training.cache();
  JavaRDD<LabeledPoint> test = data.subtract(training);

  // Run training algorithm to build the model.
  int numIterations = 100;
  final SVMModel model = SVMWithSGD.train(training.rdd(), numIterations);

  // Clear the default threshold.
  model.clearThreshold();

  // Compute raw scores on the test set.
  JavaRDD<Tuple2<Object, Object>> scoreAndLabels = test.map(
    new Function<LabeledPoint, Tuple2<Object, Object>>() {
      public Tuple2<Object, Object> call(LabeledPoint p) {
        Double score = model.predict(p.features());
        return new Tuple2<Object, Object>(score, p.label());
      }
    }
  );

  // Get evaluation metrics.
  BinaryClassificationMetrics metrics =
    new BinaryClassificationMetrics(JavaRDD.toRDD(scoreAndLabels));
  double auROC = metrics.areaUnderROC();

  System.out.println("Area under ROC = " + auROC);

  // Save and load model
  model.save(sc, "target/tmp/javaSVMWithSGDModel");
  SVMModel sameModel = SVMModel.load(sc, "target/tmp/javaSVMWithSGDModel");
  // $example off$

  sc.stop();
}
 
Example #18
Source File: JavaGradientBoostingClassificationExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  // $example on$
  SparkConf sparkConf = new SparkConf()
    .setAppName("JavaGradientBoostedTreesClassificationExample");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);

  // Load and parse the data file.
  String datapath = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
  // Split the data into training and test sets (30% held out for testing)
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
  JavaRDD<LabeledPoint> trainingData = splits[0];
  JavaRDD<LabeledPoint> testData = splits[1];

  // Train a GradientBoostedTrees model.
  // The defaultParams for Classification use LogLoss by default.
  BoostingStrategy boostingStrategy = BoostingStrategy.defaultParams("Classification");
  boostingStrategy.setNumIterations(3); // Note: Use more iterations in practice.
  boostingStrategy.getTreeStrategy().setNumClasses(2);
  boostingStrategy.getTreeStrategy().setMaxDepth(5);
  // Empty categoricalFeaturesInfo indicates all features are continuous.
  Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
  boostingStrategy.treeStrategy().setCategoricalFeaturesInfo(categoricalFeaturesInfo);

  final GradientBoostedTreesModel model =
    GradientBoostedTrees.train(trainingData, boostingStrategy);

  // Evaluate model on test instances and compute test error
  JavaPairRDD<Double, Double> predictionAndLabel =
    testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
      @Override
      public Tuple2<Double, Double> call(LabeledPoint p) {
        return new Tuple2<>(model.predict(p.features()), p.label());
      }
    });
  Double testErr =
    1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
      @Override
      public Boolean call(Tuple2<Double, Double> pl) {
        return !pl._1().equals(pl._2());
      }
    }).count() / testData.count();
  System.out.println("Test Error: " + testErr);
  System.out.println("Learned classification GBT model:\n" + model.toDebugString());

  // Save and load model
  model.save(jsc.sc(), "target/tmp/myGradientBoostingClassificationModel");
  GradientBoostedTreesModel sameModel = GradientBoostedTreesModel.load(jsc.sc(),
    "target/tmp/myGradientBoostingClassificationModel");
  // $example off$

  jsc.stop();
}
 
Example #19
Source File: JavaMulticlassClassificationMetricsExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("Multi class Classification Metrics Example");
  SparkContext sc = new SparkContext(conf);
  // $example on$
  String path = "data/mllib/sample_multiclass_classification_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();

  // Split initial RDD into two... [60% training data, 40% testing data].
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.6, 0.4}, 11L);
  JavaRDD<LabeledPoint> training = splits[0].cache();
  JavaRDD<LabeledPoint> test = splits[1];

  // Run training algorithm to build the model.
  final LogisticRegressionModel model = new LogisticRegressionWithLBFGS()
    .setNumClasses(3)
    .run(training.rdd());

  // Compute raw scores on the test set.
  JavaRDD<Tuple2<Object, Object>> predictionAndLabels = test.map(
    new Function<LabeledPoint, Tuple2<Object, Object>>() {
      public Tuple2<Object, Object> call(LabeledPoint p) {
        Double prediction = model.predict(p.features());
        return new Tuple2<Object, Object>(prediction, p.label());
      }
    }
  );

  // Get evaluation metrics.
  MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd());

  // Confusion matrix
  Matrix confusion = metrics.confusionMatrix();
  System.out.println("Confusion matrix: \n" + confusion);

  // Overall statistics
  System.out.println("Accuracy = " + metrics.accuracy());

  // Stats by labels
  for (int i = 0; i < metrics.labels().length; i++) {
    System.out.format("Class %f precision = %f\n", metrics.labels()[i],metrics.precision(
      metrics.labels()[i]));
    System.out.format("Class %f recall = %f\n", metrics.labels()[i], metrics.recall(
      metrics.labels()[i]));
    System.out.format("Class %f F1 score = %f\n", metrics.labels()[i], metrics.fMeasure(
      metrics.labels()[i]));
  }

  //Weighted stats
  System.out.format("Weighted precision = %f\n", metrics.weightedPrecision());
  System.out.format("Weighted recall = %f\n", metrics.weightedRecall());
  System.out.format("Weighted F1 score = %f\n", metrics.weightedFMeasure());
  System.out.format("Weighted false positive rate = %f\n", metrics.weightedFalsePositiveRate());

  // Save and load model
  model.save(sc, "target/tmp/LogisticRegressionModel");
  LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc,
    "target/tmp/LogisticRegressionModel");
  // $example off$
}
 
Example #20
Source File: JavaRandomForestClassificationExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  // $example on$
  SparkConf sparkConf = new SparkConf().setAppName("JavaRandomForestClassificationExample");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  // Load and parse the data file.
  String datapath = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
  // Split the data into training and test sets (30% held out for testing)
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
  JavaRDD<LabeledPoint> trainingData = splits[0];
  JavaRDD<LabeledPoint> testData = splits[1];

  // Train a RandomForest model.
  // Empty categoricalFeaturesInfo indicates all features are continuous.
  Integer numClasses = 2;
  HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
  Integer numTrees = 3; // Use more in practice.
  String featureSubsetStrategy = "auto"; // Let the algorithm choose.
  String impurity = "gini";
  Integer maxDepth = 5;
  Integer maxBins = 32;
  Integer seed = 12345;

  final RandomForestModel model = RandomForest.trainClassifier(trainingData, numClasses,
    categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins,
    seed);

  // Evaluate model on test instances and compute test error
  JavaPairRDD<Double, Double> predictionAndLabel =
    testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
      @Override
      public Tuple2<Double, Double> call(LabeledPoint p) {
        return new Tuple2<>(model.predict(p.features()), p.label());
      }
    });
  Double testErr =
    1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
      @Override
      public Boolean call(Tuple2<Double, Double> pl) {
        return !pl._1().equals(pl._2());
      }
    }).count() / testData.count();
  System.out.println("Test Error: " + testErr);
  System.out.println("Learned classification forest model:\n" + model.toDebugString());

  // Save and load model
  model.save(jsc.sc(), "target/tmp/myRandomForestClassificationModel");
  RandomForestModel sameModel = RandomForestModel.load(jsc.sc(),
    "target/tmp/myRandomForestClassificationModel");
  // $example off$

  jsc.stop();
}
 
Example #21
Source File: JavaRandomForestRegressionExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  // $example on$
  SparkConf sparkConf = new SparkConf().setAppName("JavaRandomForestRegressionExample");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  // Load and parse the data file.
  String datapath = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
  // Split the data into training and test sets (30% held out for testing)
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
  JavaRDD<LabeledPoint> trainingData = splits[0];
  JavaRDD<LabeledPoint> testData = splits[1];

  // Set parameters.
  // Empty categoricalFeaturesInfo indicates all features are continuous.
  Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
  Integer numTrees = 3; // Use more in practice.
  String featureSubsetStrategy = "auto"; // Let the algorithm choose.
  String impurity = "variance";
  Integer maxDepth = 4;
  Integer maxBins = 32;
  Integer seed = 12345;
  // Train a RandomForest model.
  final RandomForestModel model = RandomForest.trainRegressor(trainingData,
    categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins, seed);

  // Evaluate model on test instances and compute test error
  JavaPairRDD<Double, Double> predictionAndLabel =
    testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
      @Override
      public Tuple2<Double, Double> call(LabeledPoint p) {
        return new Tuple2<>(model.predict(p.features()), p.label());
      }
    });
  Double testMSE =
    predictionAndLabel.map(new Function<Tuple2<Double, Double>, Double>() {
      @Override
      public Double call(Tuple2<Double, Double> pl) {
        Double diff = pl._1() - pl._2();
        return diff * diff;
      }
    }).reduce(new Function2<Double, Double, Double>() {
      @Override
      public Double call(Double a, Double b) {
        return a + b;
      }
    }) / testData.count();
  System.out.println("Test Mean Squared Error: " + testMSE);
  System.out.println("Learned regression forest model:\n" + model.toDebugString());

  // Save and load model
  model.save(jsc.sc(), "target/tmp/myRandomForestRegressionModel");
  RandomForestModel sameModel = RandomForestModel.load(jsc.sc(),
    "target/tmp/myRandomForestRegressionModel");
  // $example off$

  jsc.stop();
}
 
Example #22
Source File: RandomForestMlib.java    From Java-Data-Science-Cookbook with MIT License
public static void main(String[] args) {

	SparkConf configuration = new SparkConf().setMaster("local[4]").setAppName("Any");
	JavaSparkContext sc = new JavaSparkContext(configuration);

	// Load and parse the data file.
	String input = "data/rf-data.txt";
	JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc.sc(), input).toJavaRDD();
	// Split the data into training and test sets (30% held out for testing)
	JavaRDD<LabeledPoint>[] dataSplits = data.randomSplit(new double[]{0.7, 0.3});
	JavaRDD<LabeledPoint> trainingData = dataSplits[0];
	JavaRDD<LabeledPoint> testData = dataSplits[1];

	// Train a RandomForest model.
	Integer numClasses = 2;
	// Empty categoricalFeaturesInfo indicates all features are continuous.
	HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
	Integer numTrees = 3; // Use more in practice.
	String featureSubsetStrategy = "auto"; // Let the algorithm choose.
	String impurity = "gini";
	Integer maxDepth = 5;
	Integer maxBins = 32;
	Integer seed = 12345;

	final RandomForestModel rfModel = RandomForest.trainClassifier(trainingData, numClasses,
			categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins,
			seed);

	// Evaluate model on test instances and compute test error
	JavaPairRDD<Double, Double> label =
			testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
				public Tuple2<Double, Double> call(LabeledPoint p) {
					return new Tuple2<Double, Double>(rfModel.predict(p.features()), p.label());
				}
			});

	Double testError =
			1.0 * label.filter(new Function<Tuple2<Double, Double>, Boolean>() {
				public Boolean call(Tuple2<Double, Double> pl) {
					return !pl._1().equals(pl._2());
				}
			}).count() / testData.count();

	System.out.println("Test Error: " + testError);
	System.out.println("Learned classification forest model:\n" + rfModel.toDebugString());

	sc.stop();
}