org.apache.spark.mllib.linalg.Matrix Java Examples

The following examples show how to use org.apache.spark.mllib.linalg.Matrix. Each example is drawn from an open-source project; the source file and license are noted in the header above each one.
Example #1
Source File: JavaPCAExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("PCA Example");
  SparkContext sc = new SparkContext(conf);

  // $example on$
  double[][] array = {{1.12, 2.05, 3.12}, {5.56, 6.28, 8.94}, {10.2, 8.0, 20.5}};
  LinkedList<Vector> rowsList = new LinkedList<>();
  for (int i = 0; i < array.length; i++) {
    Vector currentRow = Vectors.dense(array[i]);
    rowsList.add(currentRow);
  }
  JavaRDD<Vector> rows = JavaSparkContext.fromSparkContext(sc).parallelize(rowsList);

  // Create a RowMatrix from JavaRDD<Vector>.
  RowMatrix mat = new RowMatrix(rows.rdd());

  // Compute the top 3 principal components.
  Matrix pc = mat.computePrincipalComponents(3);
  RowMatrix projected = mat.multiply(pc);
  // $example off$
  Vector[] collectPartitions = (Vector[])projected.rows().collect();
  System.out.println("Projected vector of principal component:");
  for (Vector vector : collectPartitions) {
    System.out.println("\t" + vector);
  }
}
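The pc matrix returned by computePrincipalComponents has one row per input feature and one column per component. A hedged fragment that could follow the computePrincipalComponents call inside main to print each component's loadings:

  for (int col = 0; col < pc.numCols(); col++) {
    StringBuilder sb = new StringBuilder("Component " + col + ":");
    for (int row = 0; row < pc.numRows(); row++) {
      sb.append(' ').append(pc.apply(row, col));
    }
    System.out.println(sb);
  }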
 
Example #2
Source File: MLLIbUtilTest.java    From deeplearning4j with Apache License 2.0
private boolean matrixEquals(Matrix mlMatrix, INDArray indMatrix, Double eps) {
    final int mlRows = mlMatrix.numRows();
    final int mlCols = mlMatrix.numCols();
    final int indRows = indMatrix.rows();
    final int indCols = indMatrix.columns();

    if (mlRows != indRows)
        return false;
    if (mlCols != indCols)
        return false;

    for (int i = 0; i < mlRows; i++) {
        for (int j = 0; j < mlCols; j++) {
            double delta = Math.abs(mlMatrix.apply(i, j) - indMatrix.getDouble(i, j));
            if (delta > eps)
                return false;
        }
    }
    return true;
}
 
Example #3
Source File: JavaSVDExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("SVD Example");
  SparkContext sc = new SparkContext(conf);
  JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc);

  // $example on$
  double[][] array = {{1.12, 2.05, 3.12}, {5.56, 6.28, 8.94}, {10.2, 8.0, 20.5}};
  LinkedList<Vector> rowsList = new LinkedList<>();
  for (int i = 0; i < array.length; i++) {
    Vector currentRow = Vectors.dense(array[i]);
    rowsList.add(currentRow);
  }
  JavaRDD<Vector> rows = jsc.parallelize(rowsList);

  // Create a RowMatrix from JavaRDD<Vector>.
  RowMatrix mat = new RowMatrix(rows.rdd());

  // Compute the top 3 singular values and corresponding singular vectors.
  SingularValueDecomposition<RowMatrix, Matrix> svd = mat.computeSVD(3, true, 1.0E-9d);
  RowMatrix U = svd.U();
  Vector s = svd.s();
  Matrix V = svd.V();
  // $example off$
  Vector[] collectPartitions = (Vector[]) U.rows().collect();
  System.out.println("U factor is:");
  for (Vector vector : collectPartitions) {
    System.out.println("\t" + vector);
  }
  System.out.println("Singular values are: " + s);
  System.out.println("V factor is:\n" + V);

  jsc.stop();
}
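As a sanity check, the three factors can be recombined. A sketch of a fragment for inside main (assuming an extra import of org.apache.spark.mllib.linalg.Matrices) that rebuilds the input as U * diag(s) * V^T and prints the rows for comparison:

  RowMatrix reconstructed = svd.U()
      .multiply(Matrices.diag(svd.s()))
      .multiply(svd.V().transpose());
  for (Vector row : (Vector[]) reconstructed.rows().collect()) {
    System.out.println("\t" + row);
  }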
 
Example #4
Source File: JavaCorrelationsExample.java    From SparkDemo with MIT License
public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaCorrelationsExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    JavaDoubleRDD seriesX = jsc.parallelizeDoubles(
      Arrays.asList(1.0, 2.0, 3.0, 3.0, 5.0));  // a series

    // must have the same number of partitions and cardinality as seriesX
    JavaDoubleRDD seriesY = jsc.parallelizeDoubles(
      Arrays.asList(11.0, 22.0, 33.0, 33.0, 555.0));

    // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson");
    System.out.println("Correlation is: " + correlation);

    // note that each Vector is a row and not a column
    JavaRDD<Vector> data = jsc.parallelize(
      Arrays.asList(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(5.0, 33.0, 366.0)
      )
    );

    // calculate the correlation matrix using Pearson's method.
    // Use "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson");
    System.out.println(correlMatrix.toString());
    // $example off$

    jsc.stop();
  }
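The returned correlation Matrix is square and symmetric, with entry (i, j) holding the correlation between input columns i and j. A hedged fragment for reading the off-diagonal entries inside main:

    for (int i = 0; i < correlMatrix.numRows(); i++) {
      for (int j = i + 1; j < correlMatrix.numCols(); j++) {
        System.out.println("corr(col " + i + ", col " + j + ") = " + correlMatrix.apply(i, j));
      }
    }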
 
Example #5
Source File: SparkConverter.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 * Convert a local (not distributed) Spark Matrix to an Apache Commons matrix.
 *
 * @param r Never {@code null}
 * @return Not {@code null}
 */
public static RealMatrix convertSparkMatrixToRealMatrix(final Matrix r){
    final RealMatrix result = new Array2DRowRealMatrix(r.numRows(), r.numCols());
    // Matrix.toArray() returns the entries in column-major order,
    // so entry (i, j) sits at flat index j * numRows + i.
    final double[] columnMajorMat = r.toArray();
    for (int i = 0; i < r.numRows(); i++) {
        for (int j = 0; j < r.numCols(); j++) {
            result.setEntry(i, j, columnMajorMat[j * r.numRows() + i]);
        }
    }
    return result;
}
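For reference, Spark lays dense matrices out column by column, which is what the index arithmetic above relies on. A minimal standalone sketch of the layout, using only the mllib Matrices factory:

import org.apache.spark.mllib.linalg.Matrices;
import org.apache.spark.mllib.linalg.Matrix;

// Matrices.dense takes values in column-major order:
// this builds the matrix [[1.0, 3.0], [2.0, 4.0]]
Matrix m = Matrices.dense(2, 2, new double[]{1.0, 2.0, 3.0, 4.0});
System.out.println(m.apply(0, 1)); // prints 3.0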
 
Example #6
Source File: PCATangentNormalizationUtils.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 * Tangent-normalize the raw PoN data using Spark; the code here is a little more complex for optimization purposes.
 *
 *  Please see notes in docs/PoN ...
 *
 *  Ahat^T = (C^T P^T) A^T
 *  Therefore, C^T is the RowMatrix
 *
 *  pinv: P
 *  panel: A
 *  projection: Ahat
 *  cases: C
 *  betahat: C^T P^T
 *  tangentNormalizedCounts: C - Ahat
 */
private static PCATangentNormalizationResult tangentNormalizeSpark(final ReadCountCollection targetFactorNormalizedCounts,
                                                                   final RealMatrix reducedPanelCounts,
                                                                   final RealMatrix reducedPanelPInvCounts,
                                                                   final CaseToPoNTargetMapper targetMapper,
                                                                   final RealMatrix tangentNormalizationInputCounts,
                                                                   final JavaSparkContext ctx) {
    // Make the C^T a distributed matrix (RowMatrix)
    final RowMatrix caseTDistMat = SparkConverter.convertRealMatrixToSparkRowMatrix(
            ctx, tangentNormalizationInputCounts.transpose(), TN_NUM_SLICES_SPARK);

    // Spark local matrices (transposed)
    final Matrix pinvTLocalMat = new DenseMatrix(
            reducedPanelPInvCounts.getRowDimension(), reducedPanelPInvCounts.getColumnDimension(),
            Doubles.concat(reducedPanelPInvCounts.getData()), true).transpose();
    final Matrix panelTLocalMat = new DenseMatrix(
            reducedPanelCounts.getRowDimension(), reducedPanelCounts.getColumnDimension(),
            Doubles.concat(reducedPanelCounts.getData()), true).transpose();

    // Calculate the projection transpose in a distributed matrix, then convert to Apache Commons matrix (not transposed)
    final RowMatrix betahatDistMat = caseTDistMat.multiply(pinvTLocalMat);
    final RowMatrix projectionTDistMat = betahatDistMat.multiply(panelTLocalMat);
    final RealMatrix projection = SparkConverter.convertSparkRowMatrixToRealMatrix(
            projectionTDistMat, tangentNormalizationInputCounts.transpose().getRowDimension()).transpose();

    // Subtract the projection from the cases
    final RealMatrix tangentNormalizedCounts = tangentNormalizationInputCounts.subtract(projection);

    // Construct the result object and return it with the correct targets.
    final ReadCountCollection tangentNormalized = targetMapper.fromPoNtoCaseCountCollection(
            tangentNormalizedCounts, targetFactorNormalizedCounts.columnNames());
    final ReadCountCollection preTangentNormalized = targetMapper.fromPoNtoCaseCountCollection(
            tangentNormalizationInputCounts, targetFactorNormalizedCounts.columnNames());
    final RealMatrix tangentBetaHats = SparkConverter.convertSparkRowMatrixToRealMatrix(
            betahatDistMat, tangentNormalizedCounts.getColumnDimension());

    return new PCATangentNormalizationResult(tangentNormalized, preTangentNormalized, tangentBetaHats.transpose(), targetFactorNormalizedCounts);
}
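The factorization in the javadoc is just the transpose rule (X Y)^T = Y^T X^T applied twice, which is what lets the distributed side hold C^T while the small matrices stay local. A tiny Commons Math check of the identity, with made-up 2x2 matrices:

RealMatrix A = new Array2DRowRealMatrix(new double[][]{{1, 2}, {3, 4}});
RealMatrix P = new Array2DRowRealMatrix(new double[][]{{5, 6}, {7, 8}});
RealMatrix C = new Array2DRowRealMatrix(new double[][]{{9, 1}, {2, 3}});

RealMatrix lhs = A.multiply(P).multiply(C).transpose();                          // (A P C)^T
RealMatrix rhs = C.transpose().multiply(P.transpose()).multiply(A.transpose()); // C^T P^T A^T
System.out.println(lhs.subtract(rhs).getNorm() < 1e-12);                         // true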
 
Example #7
Source File: SparkConverter.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Convert a local (not distributed) Spark Matrix to an Apache Commons matrix.
 *
 * @param r Never {@code null}
 * @return Not {@code null}
 */
public static RealMatrix convertSparkMatrixToRealMatrix(final Matrix r){
    final RealMatrix result = new Array2DRowRealMatrix(r.numRows(), r.numCols());
    // Matrix.toArray() returns the entries in column-major order,
    // so entry (i, j) sits at flat index j * numRows + i.
    final double[] columnMajorMat = r.toArray();
    for (int i = 0; i < r.numRows(); i++) {
        for (int j = 0; j < r.numCols(); j++) {
            result.setEntry(i, j, columnMajorMat[j * r.numRows() + i]);
        }
    }
    return result;
}
 
Example #8
Source File: MLLibUtil.java    From deeplearning4j with Apache License 2.0
/**
 * Convert an ndarray to an mllib matrix.
 * Note that the data will be copied in column-major ('f') order,
 * duplicating the backing array if necessary.
 * @param arr the array
 * @return an mllib matrix
 */
public static Matrix toMatrix(INDArray arr) {
    if (!arr.isMatrix()) {
        throw new IllegalArgumentException("passed in array must be a matrix");
    }

    // if arr is a view - we have to dup anyway
    if (arr.isView()) {
        return Matrices.dense(arr.rows(), arr.columns(), arr.dup('f').data().asDouble());
    } else {
        // if not a view - we must ensure data is F ordered
        return Matrices.dense(arr.rows(), arr.columns(),
                        arr.ordering() == 'f' ? arr.data().asDouble() : arr.dup('f').data().asDouble());
    }
}
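A hedged usage sketch, assuming Nd4j is on the classpath: Matrices.dense expects a column-major buffer, which is why the method forces 'f' ordering before handing the data over.

INDArray arr = Nd4j.create(new double[][]{{1.0, 2.0}, {3.0, 4.0}});
Matrix m = MLLibUtil.toMatrix(arr);
System.out.println(m.apply(1, 0)); // 3.0, the same entry as arr.getDouble(1, 0)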
 
Example #9
Source File: MLLIbUtilTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testINDtoMLMatrix() {
    INDArray matIND = Nd4j.rand(23, 100);

    Matrix matMl = MLLibUtil.toMatrix(matIND);

    assertTrue(matrixEquals(matMl, matIND, 0.01));
}
 
Example #10
Source File: MLLIbUtilTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testMltoINDMatrix() {
    Matrix matMl = Matrices.randn(23, 100, new Random(3949955));

    INDArray matIND = MLLibUtil.toMatrix(matMl);
    log.info("matrix shape: {}", Arrays.toString(matIND.shapeInfoDataBuffer().asInt()));

    assertTrue(matrixEquals(matMl, matIND, 0.01));
}
 
Example #11
Source File: TrainingSparkRunner.java    From ambiverse-nlu with Apache License 2.0
private void multiClassEvaluation(DataFrame predictions, String output, TrainingSettings trainingSettings) throws IOException {
    FileSystem fs = FileSystem.get(new Configuration());
    Path evalPath = new Path(output+"multiclass_evaluation_"+trainingSettings.getClassificationMethod()+".txt");
    fs.delete(evalPath, true);
    FSDataOutputStream fsdos = fs.create(evalPath);

    MulticlassMetrics metrics = new MulticlassMetrics(predictions
            .select("prediction", "label"));

    // Confusion matrix
    Matrix confusion = metrics.confusionMatrix();
    IOUtils.write("\nConfusion matrix: \n" + confusion, fsdos);

    // Overall statistics
    IOUtils.write("\nPrecision = " + metrics.precision(), fsdos);
    IOUtils.write("\nRecall = " + metrics.recall(), fsdos);
    IOUtils.write("\nF1 Score = " + metrics.fMeasure(), fsdos);
    IOUtils.write("\n\n", fsdos);
    // Stats by labels
    for (int i = 0; i < metrics.labels().length; i++) {
        IOUtils.write(String.format("Class %f precision = %f\n", metrics.labels()[i], metrics.precision(metrics.labels()[i])), fsdos);
        IOUtils.write(String.format("Class %f recall = %f\n", metrics.labels()[i], metrics.recall(metrics.labels()[i])), fsdos);
        IOUtils.write(String.format("Class %f F1 score = %f\n", metrics.labels()[i], metrics.fMeasure(metrics.labels()[i])), fsdos);

        System.out.format("Class %f precision = %f\n", metrics.labels()[i], metrics.precision(metrics.labels()[i]));
        System.out.format("Class %f recall = %f\n", metrics.labels()[i], metrics.recall(metrics.labels()[i]));
        System.out.format("Class %f F1 score = %f\n", metrics.labels()[i], metrics.fMeasure(metrics.labels()[i]));
    }

    // Weighted stats
    IOUtils.write("\nWeighted precision = " + metrics.weightedPrecision(), fsdos);
    IOUtils.write("\nWeighted recall = " + metrics.weightedRecall(), fsdos);
    IOUtils.write("\nWeighted F1 score = " + metrics.weightedFMeasure(), fsdos);
    IOUtils.write("\nWeighted false positive rate = " + metrics.weightedFalsePositiveRate(), fsdos);

    fsdos.flush();
    IOUtils.closeQuietly(fsdos);

}
 
Example #12
Source File: JavaMulticlassClassificationMetricsExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("Multi class Classification Metrics Example");
  SparkContext sc = new SparkContext(conf);
  // $example on$
  String path = "data/mllib/sample_multiclass_classification_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();

  // Split initial RDD into two... [60% training data, 40% testing data].
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.6, 0.4}, 11L);
  JavaRDD<LabeledPoint> training = splits[0].cache();
  JavaRDD<LabeledPoint> test = splits[1];

  // Run training algorithm to build the model.
  final LogisticRegressionModel model = new LogisticRegressionWithLBFGS()
    .setNumClasses(3)
    .run(training.rdd());

  // Compute raw scores on the test set.
  JavaRDD<Tuple2<Object, Object>> predictionAndLabels = test.map(
    new Function<LabeledPoint, Tuple2<Object, Object>>() {
      public Tuple2<Object, Object> call(LabeledPoint p) {
        Double prediction = model.predict(p.features());
        return new Tuple2<Object, Object>(prediction, p.label());
      }
    }
  );

  // Get evaluation metrics.
  MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd());

  // Confusion matrix
  Matrix confusion = metrics.confusionMatrix();
  System.out.println("Confusion matrix: \n" + confusion);

  // Overall statistics
  System.out.println("Accuracy = " + metrics.accuracy());

  // Stats by labels
  for (int i = 0; i < metrics.labels().length; i++) {
    System.out.format("Class %f precision = %f\n", metrics.labels()[i],metrics.precision(
      metrics.labels()[i]));
    System.out.format("Class %f recall = %f\n", metrics.labels()[i], metrics.recall(
      metrics.labels()[i]));
    System.out.format("Class %f F1 score = %f\n", metrics.labels()[i], metrics.fMeasure(
      metrics.labels()[i]));
  }

  // Weighted stats
  System.out.format("Weighted precision = %f\n", metrics.weightedPrecision());
  System.out.format("Weighted recall = %f\n", metrics.weightedRecall());
  System.out.format("Weighted F1 score = %f\n", metrics.weightedFMeasure());
  System.out.format("Weighted false positive rate = %f\n", metrics.weightedFalsePositiveRate());

  // Save and load model
  model.save(sc, "target/tmp/LogisticRegressionModel");
  LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc,
    "target/tmp/LogisticRegressionModel");
  // $example off$
}
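In the confusion matrix, rows are actual classes and columns are predicted classes, both ordered as in metrics.labels(). A hedged fragment for reading individual entries inside main:

  double[] labels = metrics.labels();
  for (int i = 0; i < confusion.numRows(); i++) {
    for (int j = 0; j < confusion.numCols(); j++) {
      // count of instances of actual class labels[i] predicted as labels[j]
      System.out.format("actual %.0f, predicted %.0f: %.0f%n", labels[i], labels[j], confusion.apply(i, j));
    }
  }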
 
Example #13
Source File: JavaLatentDirichletAllocationExample.java    From SparkDemo with MIT License
public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaKLatentDirichletAllocationExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    // Load and parse the data
    String path = "data/mllib/sample_lda_data.txt";
    JavaRDD<String> data = jsc.textFile(path);
    JavaRDD<Vector> parsedData = data.map(
      new Function<String, Vector>() {
        public Vector call(String s) {
          String[] sarray = s.trim().split(" ");
          double[] values = new double[sarray.length];
          for (int i = 0; i < sarray.length; i++) {
            values[i] = Double.parseDouble(sarray[i]);
          }
          return Vectors.dense(values);
        }
      }
    );
    // Index documents with unique IDs
    JavaPairRDD<Long, Vector> corpus =
      JavaPairRDD.fromJavaRDD(parsedData.zipWithIndex().map(
        new Function<Tuple2<Vector, Long>, Tuple2<Long, Vector>>() {
          public Tuple2<Long, Vector> call(Tuple2<Vector, Long> doc_id) {
            return doc_id.swap();
          }
        }
      )
    );
    corpus.cache();

    // Cluster the documents into three topics using LDA
    LDAModel ldaModel = new LDA().setK(3).run(corpus);

    // Output topics. Each is a distribution over words (matching word count vectors)
    System.out.println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize()
      + " words):");
    Matrix topics = ldaModel.topicsMatrix();
    for (int topic = 0; topic < 3; topic++) {
      System.out.print("Topic " + topic + ":");
      for (int word = 0; word < ldaModel.vocabSize(); word++) {
        System.out.print(" " + topics.apply(word, topic));
      }
      System.out.println();
    }

    ldaModel.save(jsc.sc(),
      "target/org/apache/spark/JavaLatentDirichletAllocationExample/LDAModel");
    DistributedLDAModel sameModel = DistributedLDAModel.load(jsc.sc(),
      "target/org/apache/spark/JavaLatentDirichletAllocationExample/LDAModel");
    // $example off$

    jsc.stop();
  }
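The topics matrix is vocabSize x k with one column per topic; depending on the model type the column weights may not be normalized, so explicitly normalizing is a harmless way to get a proper term distribution. A hedged fragment for inside main, using the model above:

    int topic = 0;
    double sum = 0.0;
    for (int word = 0; word < ldaModel.vocabSize(); word++) {
      sum += topics.apply(word, topic);
    }
    for (int word = 0; word < ldaModel.vocabSize(); word++) {
      System.out.println("P(word " + word + " | topic " + topic + ") = "
        + topics.apply(word, topic) / sum);
    }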
 
Example #14
Source File: JavaHypothesisTestingExample.java    From SparkDemo with MIT License
public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    // a vector composed of the frequencies of events
    Vector vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25);

    // compute the goodness of fit. If a second vector to test against is not supplied
    // as a parameter, the test runs against a uniform distribution.
    ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec);
    // summary of the test including the p-value, degrees of freedom, test statistic,
    // the method used, and the null hypothesis.
    System.out.println(goodnessOfFitTestResult + "\n");

    // Create a contingency matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
    Matrix mat = Matrices.dense(3, 2, new double[]{1.0, 3.0, 5.0, 2.0, 4.0, 6.0});

    // conduct Pearson's independence test on the input contingency matrix
    ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat);
    // summary of the test including the p-value, degrees of freedom...
    System.out.println(independenceTestResult + "\n");

    // an RDD of labeled points
    JavaRDD<LabeledPoint> obs = jsc.parallelize(
      Arrays.asList(
        new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)),
        new LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)),
        new LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))
      )
    );

    // The contingency table is constructed from the raw (feature, label) pairs and used to conduct
    // the independence test. Returns an array containing the ChiSquaredTestResult for every feature
    // against the label.
    ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd());
    int i = 1;
    for (ChiSqTestResult result : featureTestResults) {
      System.out.println("Column " + i + ":");
      System.out.println(result + "\n");  // summary of the test
      i++;
    }
    // $example off$

    jsc.stop();
  }
 
Example #15
Source File: MLLibUtil.java    From deeplearning4j with Apache License 2.0
/**
 * Convert an mllib matrix to an ndarray.
 * Note that mllib matrices store their data in column-major ('f') order.
 * @param arr the mllib matrix
 * @return an ndarray with the same shape and values
 */
public static INDArray toMatrix(Matrix arr) {

    // we assume that Matrix always has F order
    return Nd4j.create(arr.toArray(), new int[] {arr.numRows(), arr.numCols()}, 'f');
}
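A hedged round-trip sketch combining this with the toMatrix(INDArray) overload from Example #8, assuming Nd4j is on the classpath:

INDArray original = Nd4j.rand(3, 4);
INDArray roundTripped = MLLibUtil.toMatrix(MLLibUtil.toMatrix(original));
System.out.println(original.equalsWithEps(roundTripped, 1e-12)); // true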
 
Example #16
Source File: SparkDl4jMultiLayer.java    From deeplearning4j with Apache License 2.0
/**
 * Predict the given feature matrix
 *
 * @param features the given feature matrix
 * @return the predictions
 */
public Matrix predict(Matrix features) {
    return MLLibUtil.toMatrix(network.output(MLLibUtil.toMatrix(features)));
}