org.apache.spark.mllib.linalg.Matrix Java Examples
The following examples show how to use org.apache.spark.mllib.linalg.Matrix, the local (in-memory, non-distributed) matrix type in Spark's MLlib. Each example is drawn from an open-source project; the source file and license are noted above each snippet.
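As a quick orientation before the project examples, here is a minimal sketch (not taken from any of the projects below) of constructing and reading a local mllib Matrix. Note that Matrices.dense takes its values in column-major order.

import org.apache.spark.mllib.linalg.Matrices;
import org.apache.spark.mllib.linalg.Matrix;

public class MatrixBasics {
    public static void main(String[] args) {
        // Dense 3x2 matrix; values are column-major, so this is
        // ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)).
        Matrix dm = Matrices.dense(3, 2, new double[]{1.0, 3.0, 5.0, 2.0, 4.0, 6.0});

        // Sparse 3x2 matrix in CSC form: column pointers, row indices, values.
        // This stores only the entries (0,0)=9.0 and (2,1)=8.0.
        Matrix sm = Matrices.sparse(3, 2, new int[]{0, 1, 2}, new int[]{0, 2}, new double[]{9.0, 8.0});

        // Entries are read with apply(row, column), regardless of storage.
        System.out.println(dm.apply(0, 1));                       // 2.0
        System.out.println(sm.apply(2, 1));                       // 8.0
        System.out.println(dm.numRows() + " x " + dm.numCols());  // 3 x 2
    }
}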
Example #1
Source File: JavaPCAExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("PCA Example");
    SparkContext sc = new SparkContext(conf);
    // $example on$
    double[][] array = {{1.12, 2.05, 3.12}, {5.56, 6.28, 8.94}, {10.2, 8.0, 20.5}};
    LinkedList<Vector> rowsList = new LinkedList<>();
    for (int i = 0; i < array.length; i++) {
        Vector currentRow = Vectors.dense(array[i]);
        rowsList.add(currentRow);
    }
    JavaRDD<Vector> rows = JavaSparkContext.fromSparkContext(sc).parallelize(rowsList);

    // Create a RowMatrix from JavaRDD<Vector>.
    RowMatrix mat = new RowMatrix(rows.rdd());

    // Compute the top 3 principal components.
    Matrix pc = mat.computePrincipalComponents(3);
    RowMatrix projected = mat.multiply(pc);
    // $example off$
    Vector[] collectPartitions = (Vector[]) projected.rows().collect();
    System.out.println("Projected vector of principal component:");
    for (Vector vector : collectPartitions) {
        System.out.println("\t" + vector);
    }
}
Example #2
Source File: MLLIbUtilTest.java From deeplearning4j with Apache License 2.0
private boolean matrixEquals(Matrix mlMatrix, INDArray indMatrix, Double eps) {
    final int mlRows = mlMatrix.numRows();
    final int mlCols = mlMatrix.numCols();
    final int indRows = indMatrix.rows();
    final int indCols = indMatrix.columns();
    if (mlRows != indRows)
        return false;
    if (mlCols != indCols)
        return false;
    for (int i = 0; i < mlRows; i++) {
        for (int j = 0; j < mlCols; j++) {
            double delta = Math.abs(mlMatrix.apply(i, j) - indMatrix.getDouble(i, j));
            if (delta > eps)
                return false;
        }
    }
    return true;
}
Example #3
Source File: JavaSVDExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("SVD Example");
    SparkContext sc = new SparkContext(conf);
    JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc);

    // $example on$
    double[][] array = {{1.12, 2.05, 3.12}, {5.56, 6.28, 8.94}, {10.2, 8.0, 20.5}};
    LinkedList<Vector> rowsList = new LinkedList<>();
    for (int i = 0; i < array.length; i++) {
        Vector currentRow = Vectors.dense(array[i]);
        rowsList.add(currentRow);
    }
    JavaRDD<Vector> rows = jsc.parallelize(rowsList);

    // Create a RowMatrix from JavaRDD<Vector>.
    RowMatrix mat = new RowMatrix(rows.rdd());

    // Compute the top 3 singular values and corresponding singular vectors.
    SingularValueDecomposition<RowMatrix, Matrix> svd = mat.computeSVD(3, true, 1.0E-9d);
    RowMatrix U = svd.U();
    Vector s = svd.s();
    Matrix V = svd.V();
    // $example off$
    Vector[] collectPartitions = (Vector[]) U.rows().collect();
    System.out.println("U factor is:");
    for (Vector vector : collectPartitions) {
        System.out.println("\t" + vector);
    }
    System.out.println("Singular values are: " + s);
    System.out.println("V factor is:\n" + V);

    jsc.stop();
}
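As a hedged follow-up (reusing the mat, U, s, and V names from the example above), the shapes of the returned factors can be checked like this: for an n x d input and k requested singular values, U is an n x k distributed RowMatrix while s and V are local.

// Illustrative shape check; not part of the original example.
int k = s.size();
System.out.println("k = " + k);                                 // 3
System.out.println("U: " + U.numRows() + " x " + U.numCols());  // 3 x 3
System.out.println("V: " + V.numRows() + " x " + V.numCols());  // 3 x 3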
Example #4
Source File: JavaCorrelationsExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaCorrelationsExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    JavaDoubleRDD seriesX = jsc.parallelizeDoubles(
        Arrays.asList(1.0, 2.0, 3.0, 3.0, 5.0));  // a series

    // must have the same number of partitions and cardinality as seriesX
    JavaDoubleRDD seriesY = jsc.parallelizeDoubles(
        Arrays.asList(11.0, 22.0, 33.0, 33.0, 555.0));

    // compute the correlation using Pearson's method; pass "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson");
    System.out.println("Correlation is: " + correlation);

    // note that each Vector is a row and not a column
    JavaRDD<Vector> data = jsc.parallelize(
        Arrays.asList(
            Vectors.dense(1.0, 10.0, 100.0),
            Vectors.dense(2.0, 20.0, 200.0),
            Vectors.dense(5.0, 33.0, 366.0)
        )
    );

    // calculate the correlation matrix using Pearson's method.
    // Use "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson");
    System.out.println(correlMatrix.toString());
    // $example off$

    jsc.stop();
}
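Since Statistics.corr returns a local mllib Matrix, individual coefficients can be read directly with apply(row, column). A small illustrative sketch, reusing the correlMatrix name from the example above:

// The correlation matrix is symmetric with a unit diagonal; entry (i, j) is
// the Pearson correlation between input column i and input column j.
for (int i = 0; i < correlMatrix.numRows(); i++) {
    for (int j = 0; j < correlMatrix.numCols(); j++) {
        System.out.printf("corr(%d, %d) = %f%n", i, j, correlMatrix.apply(i, j));
    }
}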
Example #5
Source File: SparkConverter.java From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 * Convert a local (not distributed) Spark Matrix to an Apache Commons matrix.
 *
 * @param r Never {@code null}
 * @return Not {@code null}
 */
public static RealMatrix convertSparkMatrixToRealMatrix(final Matrix r) {
    final RealMatrix result = new Array2DRowRealMatrix(r.numRows(), r.numCols());
    // Matrix.toArray() returns the entries in column-major order, so copy one column at a time.
    final double[] columnMajorMat = r.toArray();
    for (int j = 0; j < r.numCols(); j++) {
        result.setColumn(j, Arrays.copyOfRange(columnMajorMat, j * r.numRows(), (j + 1) * r.numRows()));
    }
    return result;
}
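A quick sanity check for the converter, written as a hypothetical sketch rather than GATK code: Matrices.dense also takes its values column-major, so the 2x3 matrix below has rows (1, 3, 5) and (2, 4, 6), and both views should agree entry-by-entry after conversion.

Matrix spark = Matrices.dense(2, 3, new double[]{1.0, 2.0, 3.0, 4.0, 5.0, 6.0});
RealMatrix commons = SparkConverter.convertSparkMatrixToRealMatrix(spark);
System.out.println(spark.apply(0, 2) + " == " + commons.getEntry(0, 2));  // 5.0 == 5.0
System.out.println(spark.apply(1, 0) + " == " + commons.getEntry(1, 0));  // 2.0 == 2.0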
Example #6
Source File: PCATangentNormalizationUtils.java From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 * Tangent normalize given the raw PoN data using Spark: the code here is a little more complex
 * for optimization purposes.
 *
 * Please see notes in docs/PoN ...
 *
 * Ahat^T = (C^T P^T) A^T
 * Therefore, C^T is the RowMatrix
 *
 * pinv: P
 * panel: A
 * projection: Ahat
 * cases: C
 * betahat: C^T P^T
 * tangentNormalizedCounts: C - Ahat
 */
private static PCATangentNormalizationResult tangentNormalizeSpark(
        final ReadCountCollection targetFactorNormalizedCounts,
        final RealMatrix reducedPanelCounts,
        final RealMatrix reducedPanelPInvCounts,
        final CaseToPoNTargetMapper targetMapper,
        final RealMatrix tangentNormalizationInputCounts,
        final JavaSparkContext ctx) {
    // Make the C^T a distributed matrix (RowMatrix)
    final RowMatrix caseTDistMat = SparkConverter.convertRealMatrixToSparkRowMatrix(
            ctx, tangentNormalizationInputCounts.transpose(), TN_NUM_SLICES_SPARK);

    // Spark local matrices (transposed)
    final Matrix pinvTLocalMat = new DenseMatrix(
            reducedPanelPInvCounts.getRowDimension(), reducedPanelPInvCounts.getColumnDimension(),
            Doubles.concat(reducedPanelPInvCounts.getData()), true).transpose();
    final Matrix panelTLocalMat = new DenseMatrix(
            reducedPanelCounts.getRowDimension(), reducedPanelCounts.getColumnDimension(),
            Doubles.concat(reducedPanelCounts.getData()), true).transpose();

    // Calculate the projection transpose in a distributed matrix, then convert to
    // an Apache Commons matrix (not transposed)
    final RowMatrix betahatDistMat = caseTDistMat.multiply(pinvTLocalMat);
    final RowMatrix projectionTDistMat = betahatDistMat.multiply(panelTLocalMat);
    final RealMatrix projection = SparkConverter.convertSparkRowMatrixToRealMatrix(
            projectionTDistMat,
            tangentNormalizationInputCounts.transpose().getRowDimension()).transpose();

    // Subtract the projection from the cases
    final RealMatrix tangentNormalizedCounts = tangentNormalizationInputCounts.subtract(projection);

    // Construct the result object and return it with the correct targets.
    final ReadCountCollection tangentNormalized = targetMapper.fromPoNtoCaseCountCollection(
            tangentNormalizedCounts, targetFactorNormalizedCounts.columnNames());
    final ReadCountCollection preTangentNormalized = targetMapper.fromPoNtoCaseCountCollection(
            tangentNormalizationInputCounts, targetFactorNormalizedCounts.columnNames());
    final RealMatrix tangentBetaHats = SparkConverter.convertSparkRowMatrixToRealMatrix(
            betahatDistMat, tangentNormalizedCounts.getColumnDimension());

    return new PCATangentNormalizationResult(tangentNormalized, preTangentNormalized,
            tangentBetaHats.transpose(), targetFactorNormalizedCounts);
}
Example #7
Source File: SparkConverter.java From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Convert a local (not distributed) Spark Matrix to an Apache Commons matrix.
 *
 * @param r Never {@code null}
 * @return Not {@code null}
 */
public static RealMatrix convertSparkMatrixToRealMatrix(final Matrix r) {
    final RealMatrix result = new Array2DRowRealMatrix(r.numRows(), r.numCols());
    // Matrix.toArray() returns the entries in column-major order, so copy one column at a time.
    final double[] columnMajorMat = r.toArray();
    for (int j = 0; j < r.numCols(); j++) {
        result.setColumn(j, Arrays.copyOfRange(columnMajorMat, j * r.numRows(), (j + 1) * r.numRows()));
    }
    return result;
}
Example #8
Source File: MLLibUtil.java From deeplearning4j with Apache License 2.0
/**
 * Convert an ndarray to an mllib matrix.
 * Note that the data will be converted to column-major ('f') ordering if necessary.
 * @param arr the array
 * @return an mllib matrix
 */
public static Matrix toMatrix(INDArray arr) {
    if (!arr.isMatrix()) {
        throw new IllegalArgumentException("passed in array must be a matrix");
    }

    // if arr is a view - we have to dup anyway
    if (arr.isView()) {
        return Matrices.dense(arr.rows(), arr.columns(), arr.dup('f').data().asDouble());
    } else {
        // if not a view - we must ensure data is F ordered
        return Matrices.dense(arr.rows(), arr.columns(),
                arr.ordering() == 'f' ? arr.data().asDouble() : arr.dup('f').data().asDouble());
    }
}
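A short usage sketch (illustrative, not from the deeplearning4j sources) of the ordering behavior described in the comments: a row-major array is duplicated into 'f' order before wrapping, and the source array is left untouched.

// Nd4j.create produces 'c' (row-major) arrays under the default ordering,
// so toMatrix dups the data into 'f' order rather than mutating the input.
INDArray arr = Nd4j.create(new double[][]{{1.0, 2.0}, {3.0, 4.0}});
Matrix m = MLLibUtil.toMatrix(arr);
System.out.println(m.apply(0, 1));   // 2.0 -- entries match regardless of ordering
System.out.println(arr.ordering());  // 'c' -- original array unchanged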
Example #9
Source File: MLLIbUtilTest.java From deeplearning4j with Apache License 2.0
@Test
public void testINDtoMLMatrix() {
    INDArray matIND = Nd4j.rand(23, 100);
    Matrix matMl = MLLibUtil.toMatrix(matIND);
    assertTrue(matrixEquals(matMl, matIND, 0.01));
}
Example #10
Source File: MLLIbUtilTest.java From deeplearning4j with Apache License 2.0
@Test
public void testMltoINDMatrix() {
    Matrix matMl = Matrices.randn(23, 100, new Random(3949955));
    INDArray matIND = MLLibUtil.toMatrix(matMl);
    log.info("matrix shape: {}", Arrays.toString(matIND.shapeInfoDataBuffer().asInt()));
    assertTrue(matrixEquals(matMl, matIND, 0.01));
}
Example #11
Source File: TrainingSparkRunner.java From ambiverse-nlu with Apache License 2.0
private void multiClassEvaluation(DataFrame predictions, String output, TrainingSettings trainingSettings)
        throws IOException {
    FileSystem fs = FileSystem.get(new Configuration());
    Path evalPath = new Path(output + "multiclass_evaluation_" + trainingSettings.getClassificationMethod() + ".txt");
    fs.delete(evalPath, true);
    FSDataOutputStream fsdos = fs.create(evalPath);

    MulticlassMetrics metrics = new MulticlassMetrics(predictions.select("prediction", "label"));

    // Confusion matrix
    Matrix confusion = metrics.confusionMatrix();
    IOUtils.write("\nConfusion matrix: \n" + confusion, fsdos);

    // Overall statistics
    IOUtils.write("\nPrecision = " + metrics.precision(), fsdos);
    IOUtils.write("\nRecall = " + metrics.recall(), fsdos);
    IOUtils.write("\nF1 Score = " + metrics.fMeasure(), fsdos);
    IOUtils.write("\n\n", fsdos);

    // Stats by labels
    for (int i = 0; i < metrics.labels().length; i++) {
        IOUtils.write(String.format("Class %f precision = %f\n", metrics.labels()[i], metrics.precision(metrics.labels()[i])), fsdos);
        IOUtils.write(String.format("Class %f recall = %f\n", metrics.labels()[i], metrics.recall(metrics.labels()[i])), fsdos);
        IOUtils.write(String.format("Class %f F1 score = %f\n", metrics.labels()[i], metrics.fMeasure(metrics.labels()[i])), fsdos);
        System.out.format("Class %f precision = %f\n", metrics.labels()[i], metrics.precision(metrics.labels()[i]));
        System.out.format("Class %f recall = %f\n", metrics.labels()[i], metrics.recall(metrics.labels()[i]));
        System.out.format("Class %f F1 score = %f\n", metrics.labels()[i], metrics.fMeasure(metrics.labels()[i]));
    }

    // Weighted stats
    IOUtils.write("\nWeighted precision = " + metrics.weightedPrecision(), fsdos);
    IOUtils.write("\nWeighted recall = " + metrics.weightedRecall(), fsdos);
    IOUtils.write("\nWeighted F1 score = " + metrics.weightedFMeasure(), fsdos);
    IOUtils.write("\nWeighted false positive rate = " + metrics.weightedFalsePositiveRate(), fsdos);

    fsdos.flush();
    IOUtils.closeQuietly(fsdos);
}
Example #12
Source File: JavaMulticlassClassificationMetricsExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("Multi class Classification Metrics Example");
    SparkContext sc = new SparkContext(conf);
    // $example on$
    String path = "data/mllib/sample_multiclass_classification_data.txt";
    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();
    // Split initial RDD into two... [60% training data, 40% testing data].
    JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.6, 0.4}, 11L);
    JavaRDD<LabeledPoint> training = splits[0].cache();
    JavaRDD<LabeledPoint> test = splits[1];

    // Run training algorithm to build the model.
    final LogisticRegressionModel model = new LogisticRegressionWithLBFGS()
        .setNumClasses(3)
        .run(training.rdd());

    // Compute raw scores on the test set.
    JavaRDD<Tuple2<Object, Object>> predictionAndLabels = test.map(
        new Function<LabeledPoint, Tuple2<Object, Object>>() {
            public Tuple2<Object, Object> call(LabeledPoint p) {
                Double prediction = model.predict(p.features());
                return new Tuple2<Object, Object>(prediction, p.label());
            }
        }
    );

    // Get evaluation metrics.
    MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd());

    // Confusion matrix
    Matrix confusion = metrics.confusionMatrix();
    System.out.println("Confusion matrix: \n" + confusion);

    // Overall statistics
    System.out.println("Accuracy = " + metrics.accuracy());

    // Stats by labels
    for (int i = 0; i < metrics.labels().length; i++) {
        System.out.format("Class %f precision = %f\n", metrics.labels()[i], metrics.precision(metrics.labels()[i]));
        System.out.format("Class %f recall = %f\n", metrics.labels()[i], metrics.recall(metrics.labels()[i]));
        System.out.format("Class %f F1 score = %f\n", metrics.labels()[i], metrics.fMeasure(metrics.labels()[i]));
    }

    // Weighted stats
    System.out.format("Weighted precision = %f\n", metrics.weightedPrecision());
    System.out.format("Weighted recall = %f\n", metrics.weightedRecall());
    System.out.format("Weighted F1 score = %f\n", metrics.weightedFMeasure());
    System.out.format("Weighted false positive rate = %f\n", metrics.weightedFalsePositiveRate());

    // Save and load model
    model.save(sc, "target/tmp/LogisticRegressionModel");
    LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, "target/tmp/LogisticRegressionModel");
    // $example off$
}
Example #13
Source File: JavaLatentDirichletAllocationExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaKLatentDirichletAllocationExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    // $example on$
    // Load and parse the data
    String path = "data/mllib/sample_lda_data.txt";
    JavaRDD<String> data = jsc.textFile(path);
    JavaRDD<Vector> parsedData = data.map(
        new Function<String, Vector>() {
            public Vector call(String s) {
                String[] sarray = s.trim().split(" ");
                double[] values = new double[sarray.length];
                for (int i = 0; i < sarray.length; i++) {
                    values[i] = Double.parseDouble(sarray[i]);
                }
                return Vectors.dense(values);
            }
        }
    );

    // Index documents with unique IDs
    JavaPairRDD<Long, Vector> corpus =
        JavaPairRDD.fromJavaRDD(parsedData.zipWithIndex().map(
            new Function<Tuple2<Vector, Long>, Tuple2<Long, Vector>>() {
                public Tuple2<Long, Vector> call(Tuple2<Vector, Long> doc_id) {
                    return doc_id.swap();
                }
            }
        ));
    corpus.cache();

    // Cluster the documents into three topics using LDA
    LDAModel ldaModel = new LDA().setK(3).run(corpus);

    // Output topics. Each is a distribution over words (matching word count vectors)
    System.out.println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize() + " words):");
    Matrix topics = ldaModel.topicsMatrix();
    for (int topic = 0; topic < 3; topic++) {
        System.out.print("Topic " + topic + ":");
        for (int word = 0; word < ldaModel.vocabSize(); word++) {
            System.out.print(" " + topics.apply(word, topic));
        }
        System.out.println();
    }

    ldaModel.save(jsc.sc(), "target/org/apache/spark/JavaLatentDirichletAllocationExample/LDAModel");
    DistributedLDAModel sameModel = DistributedLDAModel.load(jsc.sc(),
        "target/org/apache/spark/JavaLatentDirichletAllocationExample/LDAModel");
    // $example off$

    jsc.stop();
}
Example #14
Source File: JavaHypothesisTestingExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    // a vector composed of the frequencies of events
    Vector vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25);

    // compute the goodness of fit. If a second vector to test against is not supplied
    // as a parameter, the test runs against a uniform distribution.
    ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec);
    // summary of the test including the p-value, degrees of freedom, test statistic,
    // the method used, and the null hypothesis.
    System.out.println(goodnessOfFitTestResult + "\n");

    // Create a contingency matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
    Matrix mat = Matrices.dense(3, 2, new double[]{1.0, 3.0, 5.0, 2.0, 4.0, 6.0});

    // conduct Pearson's independence test on the input contingency matrix
    ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat);
    // summary of the test including the p-value, degrees of freedom...
    System.out.println(independenceTestResult + "\n");

    // an RDD of labeled points
    JavaRDD<LabeledPoint> obs = jsc.parallelize(
        Arrays.asList(
            new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)),
            new LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)),
            new LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))
        )
    );

    // The contingency table is constructed from the raw (feature, label) pairs and used to conduct
    // the independence test. Returns an array containing the ChiSquaredTestResult for every feature
    // against the label.
    ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd());
    int i = 1;
    for (ChiSqTestResult result : featureTestResults) {
        System.out.println("Column " + i + ":");
        System.out.println(result + "\n");  // summary of the test
        i++;
    }
    // $example off$

    jsc.stop();
}
Example #15
Source File: MLLibUtil.java From deeplearning4j with Apache License 2.0
/**
 * Convert an mllib matrix to an ndarray.
 * Note that the mllib matrix's data is assumed to be in F (column-major) ordering.
 * @param arr the mllib matrix
 * @return an ndarray
 */
public static INDArray toMatrix(Matrix arr) {
    // we assume that Matrix always has F order
    return Nd4j.create(arr.toArray(), new int[] {arr.numRows(), arr.numCols()}, 'f');
}
Example #16
Source File: SparkDl4jMultiLayer.java From deeplearning4j with Apache License 2.0
/**
 * Predict the given feature matrix
 *
 * @param features the given feature matrix
 * @return the predictions
 */
public Matrix predict(Matrix features) {
    return MLLibUtil.toMatrix(network.output(MLLibUtil.toMatrix(features)));
}