org.apache.spark.mllib.linalg.Matrix Java Examples
The following examples show how to use org.apache.spark.mllib.linalg.Matrix, the local (in-memory, non-distributed) matrix type in Spark's MLlib. Each example is drawn from an open-source project; the source file and license are noted above each snippet.
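As a quick orientation before the project examples, here is a minimal sketch (not taken from any of the projects below) of constructing and reading a local mllib Matrix. Note that Matrices.dense takes its values in column-major order.

import org.apache.spark.mllib.linalg.Matrices;
import org.apache.spark.mllib.linalg.Matrix;

public class MatrixBasics {
    public static void main(String[] args) {
        // Dense 3x2 matrix; values are column-major, so this is
        // ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)).
        Matrix dm = Matrices.dense(3, 2, new double[]{1.0, 3.0, 5.0, 2.0, 4.0, 6.0});

        // Sparse 3x2 matrix in CSC form: column pointers, row indices, values.
        // This stores only the entries (0,0)=9.0 and (2,1)=8.0.
        Matrix sm = Matrices.sparse(3, 2, new int[]{0, 1, 2}, new int[]{0, 2}, new double[]{9.0, 8.0});

        // Entries are read with apply(row, column), regardless of storage.
        System.out.println(dm.apply(0, 1));                       // 2.0
        System.out.println(sm.apply(2, 1));                       // 8.0
        System.out.println(dm.numRows() + " x " + dm.numCols());  // 3 x 2
    }
}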
Example #1
Source File: JavaPCAExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("PCA Example");
    SparkContext sc = new SparkContext(conf);
    // $example on$
    double[][] array = {{1.12, 2.05, 3.12}, {5.56, 6.28, 8.94}, {10.2, 8.0, 20.5}};
    LinkedList<Vector> rowsList = new LinkedList<>();
    for (int i = 0; i < array.length; i++) {
        Vector currentRow = Vectors.dense(array[i]);
        rowsList.add(currentRow);
    }
    JavaRDD<Vector> rows = JavaSparkContext.fromSparkContext(sc).parallelize(rowsList);

    // Create a RowMatrix from JavaRDD<Vector>.
    RowMatrix mat = new RowMatrix(rows.rdd());

    // Compute the top 3 principal components.
    Matrix pc = mat.computePrincipalComponents(3);
    RowMatrix projected = mat.multiply(pc);
    // $example off$
    Vector[] collectPartitions = (Vector[]) projected.rows().collect();
    System.out.println("Projected vector of principal component:");
    for (Vector vector : collectPartitions) {
        System.out.println("\t" + vector);
    }
}
Example #2
Source File: MLLIbUtilTest.java From deeplearning4j with Apache License 2.0
private boolean matrixEquals(Matrix mlMatrix, INDArray indMatrix, Double eps) {
    final int mlRows = mlMatrix.numRows();
    final int mlCols = mlMatrix.numCols();
    final int indRows = indMatrix.rows();
    final int indCols = indMatrix.columns();
    if (mlRows != indRows)
        return false;
    if (mlCols != indCols)
        return false;
    for (int i = 0; i < mlRows; i++) {
        for (int j = 0; j < mlCols; j++) {
            double delta = Math.abs(mlMatrix.apply(i, j) - indMatrix.getDouble(i, j));
            if (delta > eps)
                return false;
        }
    }
    return true;
}
Example #3
Source File: JavaSVDExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("SVD Example");
    SparkContext sc = new SparkContext(conf);
    JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc);

    // $example on$
    double[][] array = {{1.12, 2.05, 3.12}, {5.56, 6.28, 8.94}, {10.2, 8.0, 20.5}};
    LinkedList<Vector> rowsList = new LinkedList<>();
    for (int i = 0; i < array.length; i++) {
        Vector currentRow = Vectors.dense(array[i]);
        rowsList.add(currentRow);
    }
    JavaRDD<Vector> rows = jsc.parallelize(rowsList);

    // Create a RowMatrix from JavaRDD<Vector>.
    RowMatrix mat = new RowMatrix(rows.rdd());

    // Compute the top 3 singular values and corresponding singular vectors.
    SingularValueDecomposition<RowMatrix, Matrix> svd = mat.computeSVD(3, true, 1.0E-9d);
    RowMatrix U = svd.U();
    Vector s = svd.s();
    Matrix V = svd.V();
    // $example off$
    Vector[] collectPartitions = (Vector[]) U.rows().collect();
    System.out.println("U factor is:");
    for (Vector vector : collectPartitions) {
        System.out.println("\t" + vector);
    }
    System.out.println("Singular values are: " + s);
    System.out.println("V factor is:\n" + V);

    jsc.stop();
}
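As a hedged follow-up (reusing the mat, U, s, and V names from the example above), the shapes of the returned factors can be checked like this: for an n x d input and k requested singular values, U is an n x k distributed RowMatrix while s and V are local.

// Illustrative shape check; not part of the original example.
int k = s.size();
System.out.println("k = " + k);                                 // 3
System.out.println("U: " + U.numRows() + " x " + U.numCols());  // 3 x 3
System.out.println("V: " + V.numRows() + " x " + V.numCols());  // 3 x 3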
Example #4
Source File: JavaCorrelationsExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaCorrelationsExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    JavaDoubleRDD seriesX = jsc.parallelizeDoubles(
        Arrays.asList(1.0, 2.0, 3.0, 3.0, 5.0));  // a series

    // must have the same number of partitions and cardinality as seriesX
    JavaDoubleRDD seriesY = jsc.parallelizeDoubles(
        Arrays.asList(11.0, 22.0, 33.0, 33.0, 555.0));

    // compute the correlation using Pearson's method; pass "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson");
    System.out.println("Correlation is: " + correlation);

    // note that each Vector is a row and not a column
    JavaRDD<Vector> data = jsc.parallelize(
        Arrays.asList(
            Vectors.dense(1.0, 10.0, 100.0),
            Vectors.dense(2.0, 20.0, 200.0),
            Vectors.dense(5.0, 33.0, 366.0)
        )
    );

    // calculate the correlation matrix using Pearson's method.
    // Use "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson");
    System.out.println(correlMatrix.toString());
    // $example off$

    jsc.stop();
}
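Since Statistics.corr returns a local mllib Matrix, individual coefficients can be read directly with apply(row, column). A small illustrative sketch, reusing the correlMatrix name from the example above:

// The correlation matrix is symmetric with a unit diagonal; entry (i, j) is
// the Pearson correlation between input column i and input column j.
for (int i = 0; i < correlMatrix.numRows(); i++) {
    for (int j = 0; j < correlMatrix.numCols(); j++) {
        System.out.printf("corr(%d, %d) = %f%n", i, j, correlMatrix.apply(i, j));
    }
}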
Example #5
Source File: SparkConverter.java From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 * Convert a local (not distributed) Spark Matrix to an Apache Commons matrix.
 *
 * @param r Never {@code null}
 * @return Not {@code null}
 */
public static RealMatrix convertSparkMatrixToRealMatrix(final Matrix r) {
    final RealMatrix result = new Array2DRowRealMatrix(r.numRows(), r.numCols());
    // Matrix.toArray() returns the entries in column-major order, so copy one column at a time.
    final double[] columnMajorMat = r.toArray();
    for (int j = 0; j < r.numCols(); j++) {
        result.setColumn(j, Arrays.copyOfRange(columnMajorMat, j * r.numRows(), (j + 1) * r.numRows()));
    }
    return result;
}
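A quick sanity check for the converter, written as a hypothetical sketch rather than GATK code: Matrices.dense also takes its values column-major, so the 2x3 matrix below has rows (1, 3, 5) and (2, 4, 6), and both views should agree entry-by-entry after conversion.

Matrix spark = Matrices.dense(2, 3, new double[]{1.0, 2.0, 3.0, 4.0, 5.0, 6.0});
RealMatrix commons = SparkConverter.convertSparkMatrixToRealMatrix(spark);
System.out.println(spark.apply(0, 2) + " == " + commons.getEntry(0, 2));  // 5.0 == 5.0
System.out.println(spark.apply(1, 0) + " == " + commons.getEntry(1, 0));  // 2.0 == 2.0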
Example #6
Source File: PCATangentNormalizationUtils.java From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 * Tangent normalize given the raw PoN data using Spark: the code here is a little more complex
 * for optimization purposes.
 *
 * Please see notes in docs/PoN ...
 *
 * Ahat^T = (C^T P^T) A^T
 * Therefore, C^T is the RowMatrix
 *
 * pinv: P
 * panel: A
 * projection: Ahat
 * cases: C
 * betahat: C^T P^T
 * tangentNormalizedCounts: C - Ahat
 */
private static PCATangentNormalizationResult tangentNormalizeSpark(
        final ReadCountCollection targetFactorNormalizedCounts,
        final RealMatrix reducedPanelCounts,
        final RealMatrix reducedPanelPInvCounts,
        final CaseToPoNTargetMapper targetMapper,
        final RealMatrix tangentNormalizationInputCounts,
        final JavaSparkContext ctx) {
    // Make the C^T a distributed matrix (RowMatrix)
    final RowMatrix caseTDistMat = SparkConverter.convertRealMatrixToSparkRowMatrix(
            ctx, tangentNormalizationInputCounts.transpose(), TN_NUM_SLICES_SPARK);

    // Spark local matrices (transposed)
    final Matrix pinvTLocalMat = new DenseMatrix(
            reducedPanelPInvCounts.getRowDimension(), reducedPanelPInvCounts.getColumnDimension(),
            Doubles.concat(reducedPanelPInvCounts.getData()), true).transpose();
    final Matrix panelTLocalMat = new DenseMatrix(
            reducedPanelCounts.getRowDimension(), reducedPanelCounts.getColumnDimension(),
            Doubles.concat(reducedPanelCounts.getData()), true).transpose();

    // Calculate the projection transpose in a distributed matrix, then convert to
    // an Apache Commons matrix (not transposed)
    final RowMatrix betahatDistMat = caseTDistMat.multiply(pinvTLocalMat);
    final RowMatrix projectionTDistMat = betahatDistMat.multiply(panelTLocalMat);
    final RealMatrix projection = SparkConverter.convertSparkRowMatrixToRealMatrix(
            projectionTDistMat,
            tangentNormalizationInputCounts.transpose().getRowDimension()).transpose();

    // Subtract the projection from the cases
    final RealMatrix tangentNormalizedCounts = tangentNormalizationInputCounts.subtract(projection);

    // Construct the result object and return it with the correct targets.
    final ReadCountCollection tangentNormalized = targetMapper.fromPoNtoCaseCountCollection(
            tangentNormalizedCounts, targetFactorNormalizedCounts.columnNames());
    final ReadCountCollection preTangentNormalized = targetMapper.fromPoNtoCaseCountCollection(
            tangentNormalizationInputCounts, targetFactorNormalizedCounts.columnNames());
    final RealMatrix tangentBetaHats = SparkConverter.convertSparkRowMatrixToRealMatrix(
            betahatDistMat, tangentNormalizedCounts.getColumnDimension());

    return new PCATangentNormalizationResult(tangentNormalized, preTangentNormalized,
            tangentBetaHats.transpose(), targetFactorNormalizedCounts);
}
Example #7
Source File: SparkConverter.java From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Convert a local (not distributed) Spark Matrix to an Apache Commons matrix.
 *
 * @param r Never {@code null}
 * @return Not {@code null}
 */
public static RealMatrix convertSparkMatrixToRealMatrix(final Matrix r) {
    final RealMatrix result = new Array2DRowRealMatrix(r.numRows(), r.numCols());
    // Matrix.toArray() returns the entries in column-major order, so copy one column at a time.
    final double[] columnMajorMat = r.toArray();
    for (int j = 0; j < r.numCols(); j++) {
        result.setColumn(j, Arrays.copyOfRange(columnMajorMat, j * r.numRows(), (j + 1) * r.numRows()));
    }
    return result;
}
Example #8
Source File: MLLibUtil.java From deeplearning4j with Apache License 2.0
/**
 * Convert an ndarray to an mllib matrix.
 * Note that the data will be converted to column-major ('f') ordering if necessary.
 * @param arr the array
 * @return an mllib matrix
 */
public static Matrix toMatrix(INDArray arr) {
    if (!arr.isMatrix()) {
        throw new IllegalArgumentException("passed in array must be a matrix");
    }

    // if arr is a view - we have to dup anyway
    if (arr.isView()) {
        return Matrices.dense(arr.rows(), arr.columns(), arr.dup('f').data().asDouble());
    } else {
        // if not a view - we must ensure data is F ordered
        return Matrices.dense(arr.rows(), arr.columns(),
                arr.ordering() == 'f' ? arr.data().asDouble() : arr.dup('f').data().asDouble());
    }
}
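A short usage sketch (illustrative, not from the deeplearning4j sources) of the ordering behavior described in the comments: a row-major array is duplicated into 'f' order before wrapping, and the source array is left untouched.

// Nd4j.create produces 'c' (row-major) arrays under the default ordering,
// so toMatrix dups the data into 'f' order rather than mutating the input.
INDArray arr = Nd4j.create(new double[][]{{1.0, 2.0}, {3.0, 4.0}});
Matrix m = MLLibUtil.toMatrix(arr);
System.out.println(m.apply(0, 1));   // 2.0 -- entries match regardless of ordering
System.out.println(arr.ordering());  // 'c' -- original array unchanged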
Example #9
Source File: MLLIbUtilTest.java From deeplearning4j with Apache License 2.0
@Test
public void testINDtoMLMatrix() {
    INDArray matIND = Nd4j.rand(23, 100);
    Matrix matMl = MLLibUtil.toMatrix(matIND);
    assertTrue(matrixEquals(matMl, matIND, 0.01));
}
Example #10
Source File: MLLIbUtilTest.java From deeplearning4j with Apache License 2.0
@Test
public void testMltoINDMatrix() {
    Matrix matMl = Matrices.randn(23, 100, new Random(3949955));
    INDArray matIND = MLLibUtil.toMatrix(matMl);
    log.info("matrix shape: {}", Arrays.toString(matIND.shapeInfoDataBuffer().asInt()));
    assertTrue(matrixEquals(matMl, matIND, 0.01));
}
Example #11
Source File: TrainingSparkRunner.java From ambiverse-nlu with Apache License 2.0
private void multiClassEvaluation(DataFrame predictions, String output, TrainingSettings trainingSettings)
        throws IOException {
    FileSystem fs = FileSystem.get(new Configuration());
    Path evalPath = new Path(output + "multiclass_evaluation_" + trainingSettings.getClassificationMethod() + ".txt");
    fs.delete(evalPath, true);
    FSDataOutputStream fsdos = fs.create(evalPath);

    MulticlassMetrics metrics = new MulticlassMetrics(predictions.select("prediction", "label"));

    // Confusion matrix
    Matrix confusion = metrics.confusionMatrix();
    IOUtils.write("\nConfusion matrix: \n" + confusion, fsdos);

    // Overall statistics
    IOUtils.write("\nPrecision = " + metrics.precision(), fsdos);
    IOUtils.write("\nRecall = " + metrics.recall(), fsdos);
    IOUtils.write("\nF1 Score = " + metrics.fMeasure(), fsdos);
    IOUtils.write("\n\n", fsdos);

    // Stats by labels
    for (int i = 0; i < metrics.labels().length; i++) {
        IOUtils.write(String.format("Class %f precision = %f\n", metrics.labels()[i], metrics.precision(metrics.labels()[i])), fsdos);
        IOUtils.write(String.format("Class %f recall = %f\n", metrics.labels()[i], metrics.recall(metrics.labels()[i])), fsdos);
        IOUtils.write(String.format("Class %f F1 score = %f\n", metrics.labels()[i], metrics.fMeasure(metrics.labels()[i])), fsdos);
        System.out.format("Class %f precision = %f\n", metrics.labels()[i], metrics.precision(metrics.labels()[i]));
        System.out.format("Class %f recall = %f\n", metrics.labels()[i], metrics.recall(metrics.labels()[i]));
        System.out.format("Class %f F1 score = %f\n", metrics.labels()[i], metrics.fMeasure(metrics.labels()[i]));
    }

    // Weighted stats
    IOUtils.write("\nWeighted precision = " + metrics.weightedPrecision(), fsdos);
    IOUtils.write("\nWeighted recall = " + metrics.weightedRecall(), fsdos);
    IOUtils.write("\nWeighted F1 score = " + metrics.weightedFMeasure(), fsdos);
    IOUtils.write("\nWeighted false positive rate = " + metrics.weightedFalsePositiveRate(), fsdos);

    fsdos.flush();
    IOUtils.closeQuietly(fsdos);
}
Example #12
Source File: JavaMulticlassClassificationMetricsExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("Multi class Classification Metrics Example");
    SparkContext sc = new SparkContext(conf);
    // $example on$
    String path = "data/mllib/sample_multiclass_classification_data.txt";
    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();
    // Split initial RDD into two... [60% training data, 40% testing data].
    JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.6, 0.4}, 11L);
    JavaRDD<LabeledPoint> training = splits[0].cache();
    JavaRDD<LabeledPoint> test = splits[1];

    // Run training algorithm to build the model.
    final LogisticRegressionModel model = new LogisticRegressionWithLBFGS()
        .setNumClasses(3)
        .run(training.rdd());

    // Compute raw scores on the test set.
    JavaRDD<Tuple2<Object, Object>> predictionAndLabels = test.map(
        new Function<LabeledPoint, Tuple2<Object, Object>>() {
            public Tuple2<Object, Object> call(LabeledPoint p) {
                Double prediction = model.predict(p.features());
                return new Tuple2<Object, Object>(prediction, p.label());
            }
        }
    );

    // Get evaluation metrics.
    MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd());

    // Confusion matrix
    Matrix confusion = metrics.confusionMatrix();
    System.out.println("Confusion matrix: \n" + confusion);

    // Overall statistics
    System.out.println("Accuracy = " + metrics.accuracy());

    // Stats by labels
    for (int i = 0; i < metrics.labels().length; i++) {
        System.out.format("Class %f precision = %f\n", metrics.labels()[i], metrics.precision(metrics.labels()[i]));
        System.out.format("Class %f recall = %f\n", metrics.labels()[i], metrics.recall(metrics.labels()[i]));
        System.out.format("Class %f F1 score = %f\n", metrics.labels()[i], metrics.fMeasure(metrics.labels()[i]));
    }

    // Weighted stats
    System.out.format("Weighted precision = %f\n", metrics.weightedPrecision());
    System.out.format("Weighted recall = %f\n", metrics.weightedRecall());
    System.out.format("Weighted F1 score = %f\n", metrics.weightedFMeasure());
    System.out.format("Weighted false positive rate = %f\n", metrics.weightedFalsePositiveRate());

    // Save and load model
    model.save(sc, "target/tmp/LogisticRegressionModel");
    LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, "target/tmp/LogisticRegressionModel");
    // $example off$
}
Example #13
Source File: JavaLatentDirichletAllocationExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaKLatentDirichletAllocationExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    // $example on$
    // Load and parse the data
    String path = "data/mllib/sample_lda_data.txt";
    JavaRDD<String> data = jsc.textFile(path);
    JavaRDD<Vector> parsedData = data.map(
        new Function<String, Vector>() {
            public Vector call(String s) {
                String[] sarray = s.trim().split(" ");
                double[] values = new double[sarray.length];
                for (int i = 0; i < sarray.length; i++) {
                    values[i] = Double.parseDouble(sarray[i]);
                }
                return Vectors.dense(values);
            }
        }
    );

    // Index documents with unique IDs
    JavaPairRDD<Long, Vector> corpus =
        JavaPairRDD.fromJavaRDD(parsedData.zipWithIndex().map(
            new Function<Tuple2<Vector, Long>, Tuple2<Long, Vector>>() {
                public Tuple2<Long, Vector> call(Tuple2<Vector, Long> doc_id) {
                    return doc_id.swap();
                }
            }
        ));
    corpus.cache();

    // Cluster the documents into three topics using LDA
    LDAModel ldaModel = new LDA().setK(3).run(corpus);

    // Output topics. Each is a distribution over words (matching word count vectors)
    System.out.println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize() + " words):");
    Matrix topics = ldaModel.topicsMatrix();
    for (int topic = 0; topic < 3; topic++) {
        System.out.print("Topic " + topic + ":");
        for (int word = 0; word < ldaModel.vocabSize(); word++) {
            System.out.print(" " + topics.apply(word, topic));
        }
        System.out.println();
    }

    ldaModel.save(jsc.sc(), "target/org/apache/spark/JavaLatentDirichletAllocationExample/LDAModel");
    DistributedLDAModel sameModel = DistributedLDAModel.load(jsc.sc(),
        "target/org/apache/spark/JavaLatentDirichletAllocationExample/LDAModel");
    // $example off$

    jsc.stop();
}
Example #14
Source File: JavaHypothesisTestingExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    // a vector composed of the frequencies of events
    Vector vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25);

    // compute the goodness of fit. If a second vector to test against is not supplied
    // as a parameter, the test runs against a uniform distribution.
    ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec);
    // summary of the test including the p-value, degrees of freedom, test statistic,
    // the method used, and the null hypothesis.
    System.out.println(goodnessOfFitTestResult + "\n");

    // Create a contingency matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
    Matrix mat = Matrices.dense(3, 2, new double[]{1.0, 3.0, 5.0, 2.0, 4.0, 6.0});

    // conduct Pearson's independence test on the input contingency matrix
    ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat);
    // summary of the test including the p-value, degrees of freedom...
    System.out.println(independenceTestResult + "\n");

    // an RDD of labeled points
    JavaRDD<LabeledPoint> obs = jsc.parallelize(
        Arrays.asList(
            new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)),
            new LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)),
            new LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))
        )
    );

    // The contingency table is constructed from the raw (feature, label) pairs and used to conduct
    // the independence test. Returns an array containing the ChiSquaredTestResult for every feature
    // against the label.
    ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd());
    int i = 1;
    for (ChiSqTestResult result : featureTestResults) {
        System.out.println("Column " + i + ":");
        System.out.println(result + "\n");  // summary of the test
        i++;
    }
    // $example off$

    jsc.stop();
}
Example #15
Source File: MLLibUtil.java From deeplearning4j with Apache License 2.0
/**
 * Convert an mllib matrix to an ndarray.
 * Note that the mllib matrix's data is assumed to be in F (column-major) ordering.
 * @param arr the mllib matrix
 * @return an ndarray
 */
public static INDArray toMatrix(Matrix arr) {
    // we assume that Matrix always has F order
    return Nd4j.create(arr.toArray(), new int[] {arr.numRows(), arr.numCols()}, 'f');
}
Example #16
Source File: SparkDl4jMultiLayer.java From deeplearning4j with Apache License 2.0
/**
 * Predict the given feature matrix
 *
 * @param features the given feature matrix
 * @return the predictions
 */
public Matrix predict(Matrix features) {
    return MLLibUtil.toMatrix(network.output(MLLibUtil.toMatrix(features)));
}