org.apache.spark.mllib.linalg.distributed.RowMatrix Java Examples

The following examples show how to use org.apache.spark.mllib.linalg.distributed.RowMatrix. Each example is taken from the open-source project named above its code.
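Every example below follows the same basic pattern: build a JavaRDD<Vector> of matrix rows, then wrap its underlying Scala RDD in a RowMatrix. A minimal, self-contained sketch of that pattern (the class name and the local master setting are illustrative, not taken from any of the projects below):

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.linalg.distributed.RowMatrix;

public class RowMatrixSketch {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("RowMatrix sketch").setMaster("local[*]");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // Each local Vector becomes one row of the distributed matrix.
    List<Vector> rows = Arrays.asList(
        Vectors.dense(1.0, 2.0, 3.0),
        Vectors.dense(4.0, 5.0, 6.0));
    JavaRDD<Vector> rowRdd = jsc.parallelize(rows);

    // RowMatrix wraps the underlying Scala RDD, hence rowRdd.rdd().
    RowMatrix mat = new RowMatrix(rowRdd.rdd());
    System.out.println(mat.numRows() + " x " + mat.numCols());

    jsc.stop();
  }
}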
Example #1
Source File: JavaPCAExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("PCA Example");
  SparkContext sc = new SparkContext(conf);

  // $example on$
  double[][] array = {{1.12, 2.05, 3.12}, {5.56, 6.28, 8.94}, {10.2, 8.0, 20.5}};
  LinkedList<Vector> rowsList = new LinkedList<>();
  for (int i = 0; i < array.length; i++) {
    Vector currentRow = Vectors.dense(array[i]);
    rowsList.add(currentRow);
  }
  JavaRDD<Vector> rows = JavaSparkContext.fromSparkContext(sc).parallelize(rowsList);

  // Create a RowMatrix from JavaRDD<Vector>.
  RowMatrix mat = new RowMatrix(rows.rdd());

  // Compute the top 3 principal components.
  Matrix pc = mat.computePrincipalComponents(3);
  RowMatrix projected = mat.multiply(pc);
  // $example off$
  Vector[] collectPartitions = (Vector[])projected.rows().collect();
  System.out.println("Projected vector of principal component:");
  for (Vector vector : collectPartitions) {
    System.out.println("\t" + vector);
  }

  sc.stop();
}
 
Example #2
Source File: SparkConverter.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 * Create a distributed matrix given an Apache Commons RealMatrix.
 *
 * @param sc Never {@code null}
 * @param realMat Apache Commons RealMatrix.  Never {@code null}
 * @param numSlices the number of Spark partitions to split the matrix rows into
 * @return A distributed Spark matrix
 */
public static RowMatrix convertRealMatrixToSparkRowMatrix(JavaSparkContext sc, RealMatrix realMat, int numSlices) {
    logger.info("Converting matrix to distributed Spark matrix...");
    final double [][] dataArray = realMat.getData();
    final LinkedList<Vector> rowsList = new LinkedList<>();
    for (final double [] i : dataArray) {
        final Vector currentRow = Vectors.dense(i);
        rowsList.add(currentRow);
    }

    // We may want to swap out this static value for something dynamic (as shown below), but this seems to slow it down.
    // final int totalSpace = realMat.getColumnDimension() * realMat.getRowDimension() * Double.BYTES;
    // // Want the partitions to be ~100KB of space
    // final int slices = totalSpace/100000;
    final JavaRDD<Vector> rows = sc.parallelize(rowsList, numSlices);

    // Create a RowMatrix from JavaRDD<Vector>.
    final RowMatrix mat = new RowMatrix(rows.rdd());
    logger.info("Done converting matrix to distributed Spark matrix...");
    return mat;
}
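A hypothetical call site for this converter (not taken from the GATK sources; ctx is assumed to be an existing JavaSparkContext and the slice count is arbitrary) might look like this:

// Hypothetical usage -- the 2x3 matrix and the numSlices value of 4 are illustrative only.
final RealMatrix local = new Array2DRowRealMatrix(new double[][] {
    {1.0, 2.0, 3.0},
    {4.0, 5.0, 6.0}
});
final RowMatrix distributed = SparkConverter.convertRealMatrixToSparkRowMatrix(ctx, local, 4);
System.out.println(distributed.numRows() + " x " + distributed.numCols());  // prints 2 x 3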
 
Example #3
Source File: SparkConverter.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 * Create an Apache Commons RealMatrix from a Spark RowMatrix.
 *
 * @param r Never {@code null}
 * @param cachedNumRows Checking the number of rows in {@code r} can be time-consuming.  Provide the value here, if it is already known.
 *                      Use {@code -1} if unknown.
 * @return Never {@code null}
 */
public static RealMatrix convertSparkRowMatrixToRealMatrix(final RowMatrix r, final int cachedNumRows) {

    Utils.nonNull(r, "Input row matrix cannot be null");

    int numRows;
    if (cachedNumRows == -1) {
        // This takes a while in Spark
        numRows = (int) r.numRows();
    } else {
        numRows = cachedNumRows;
    }

    final int numCols = (int) r.numCols();

    // This cast looks redundant but is required; compilation fails if it is removed.
    //   The exact reason is unknown.
    final Vector [] rowVectors = (Vector []) r.rows().collect();

    final RealMatrix result = new Array2DRowRealMatrix(numRows, numCols);
    for (int i = 0; i < numRows; i++) {
        result.setRow(i, rowVectors[i].toArray() );
    }
    return result;
}
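Continuing the hypothetical snippet under Example #2, the round trip back to a local Apache Commons matrix could look like this (again illustrative, not GATK code):

// Pass the row count we already know so that r.numRows() is not recomputed on the cluster;
// passing -1 instead would trigger the slower distributed count.
final RealMatrix roundTripped =
        SparkConverter.convertSparkRowMatrixToRealMatrix(distributed, local.getRowDimension());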
 
Example #4
Source File: SparkConverter.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Create a distributed matrix given an Apache Commons RealMatrix.
 *
 * @param sc Never {@code null}
 * @param realMat Apache Commons RealMatrix.  Never {@code null}
 * @param numSlices the number of Spark partitions to split the matrix rows into
 * @return A distributed Spark matrix
 */
public static RowMatrix convertRealMatrixToSparkRowMatrix(JavaSparkContext sc, RealMatrix realMat, int numSlices) {
    logger.info("Converting matrix to distributed Spark matrix...");
    final double [][] dataArray = realMat.getData();
    final LinkedList<Vector> rowsList = new LinkedList<>();
    for (final double [] i : dataArray) {
        final Vector currentRow = Vectors.dense(i);
        rowsList.add(currentRow);
    }

    // We may want to swap out this static value for something dynamic (as shown below), but this seems to slow it down.
    // final int totalSpace = realMat.getColumnDimension() * realMat.getRowDimension() * Double.BYTES;
    // // Want the partitions to be ~100KB of space
    // final int slices = totalSpace/100000;
    final JavaRDD<Vector> rows = sc.parallelize(rowsList, numSlices);

    // Create a RowMatrix from JavaRDD<Vector>.
    final RowMatrix mat = new RowMatrix(rows.rdd());
    logger.info("Done converting matrix to distributed Spark matrix...");
    return mat;
}
 
Example #5
Source File: SparkConverter.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Create an Apache Commons RealMatrix from a Spark RowMatrix.
 *
 * @param r Never {@code null}
 * @param cachedNumRows Checking the number of rows in {@code r} can be time-consuming.  Provide the value here, if it is already known.
 *                      Use {@code -1} if unknown.
 * @return Never {@code null}
 */
public static RealMatrix convertSparkRowMatrixToRealMatrix(final RowMatrix r, final int cachedNumRows) {

    Utils.nonNull(r, "Input row matrix cannot be null");

    int numRows;
    if (cachedNumRows == -1) {
        // This takes a while in Spark
        numRows = (int) r.numRows();
    } else {
        numRows = cachedNumRows;
    }

    final int numCols = (int) r.numCols();

    // This cast looks redundant but is required; compilation fails if it is removed.
    //   The exact reason is unknown.
    final Vector [] rowVectors = (Vector []) r.rows().collect();

    final RealMatrix result = new Array2DRowRealMatrix(numRows, numCols);
    for (int i = 0; i < numRows; i++) {
        result.setRow(i, rowVectors[i].toArray() );
    }
    return result;
}
 
Example #6
Source File: JavaSVDExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("SVD Example");
  SparkContext sc = new SparkContext(conf);
  JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc);

  // $example on$
  double[][] array = {{1.12, 2.05, 3.12}, {5.56, 6.28, 8.94}, {10.2, 8.0, 20.5}};
  LinkedList<Vector> rowsList = new LinkedList<>();
  for (int i = 0; i < array.length; i++) {
    Vector currentRow = Vectors.dense(array[i]);
    rowsList.add(currentRow);
  }
  JavaRDD<Vector> rows = jsc.parallelize(rowsList);

  // Create a RowMatrix from JavaRDD<Vector>.
  RowMatrix mat = new RowMatrix(rows.rdd());

  // Compute the top 3 singular values and corresponding singular vectors.
  SingularValueDecomposition<RowMatrix, Matrix> svd = mat.computeSVD(3, true, 1.0E-9d);
  RowMatrix U = svd.U();
  Vector s = svd.s();
  Matrix V = svd.V();
  // $example off$
  Vector[] collectPartitions = (Vector[]) U.rows().collect();
  System.out.println("U factor is:");
  for (Vector vector : collectPartitions) {
    System.out.println("\t" + vector);
  }
  System.out.println("Singular values are: " + s);
  System.out.println("V factor is:\n" + V);

  jsc.stop();
}
 
Example #7
Source File: PCATangentNormalizationUtils.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 *  Tangent normalize, given the raw PoN data, using Spark: the code here is a little more complex for optimization purposes.
 *
 *  Please see notes in docs/PoN ...
 *
 *  Ahat^T = (C^T P^T) A^T
 *  Therefore, C^T is the RowMatrix
 *
 *  pinv: P
 *  panel: A
 *  projection: Ahat
 *  cases: C
 *  betahat: C^T P^T
 *  tangentNormalizedCounts: C - Ahat
 */
private static PCATangentNormalizationResult tangentNormalizeSpark(final ReadCountCollection targetFactorNormalizedCounts,
                                                                   final RealMatrix reducedPanelCounts,
                                                                   final RealMatrix reducedPanelPInvCounts,
                                                                   final CaseToPoNTargetMapper targetMapper,
                                                                   final RealMatrix tangentNormalizationInputCounts,
                                                                   final JavaSparkContext ctx) {
    // Make the C^T a distributed matrix (RowMatrix)
    final RowMatrix caseTDistMat = SparkConverter.convertRealMatrixToSparkRowMatrix(
            ctx, tangentNormalizationInputCounts.transpose(), TN_NUM_SLICES_SPARK);

    // Spark local matrices (transposed)
    final Matrix pinvTLocalMat = new DenseMatrix(
            reducedPanelPInvCounts.getRowDimension(), reducedPanelPInvCounts.getColumnDimension(),
            Doubles.concat(reducedPanelPInvCounts.getData()), true).transpose();
    final Matrix panelTLocalMat = new DenseMatrix(
            reducedPanelCounts.getRowDimension(), reducedPanelCounts.getColumnDimension(),
            Doubles.concat(reducedPanelCounts.getData()), true).transpose();

    // Calculate the projection transpose in a distributed matrix, then convert to Apache Commons matrix (not transposed)
    final RowMatrix betahatDistMat = caseTDistMat.multiply(pinvTLocalMat);
    final RowMatrix projectionTDistMat = betahatDistMat.multiply(panelTLocalMat);
    final RealMatrix projection = SparkConverter.convertSparkRowMatrixToRealMatrix(
            projectionTDistMat, tangentNormalizationInputCounts.transpose().getRowDimension()).transpose();

    // Subtract the projection from the cases
    final RealMatrix tangentNormalizedCounts = tangentNormalizationInputCounts.subtract(projection);

    // Construct the result object and return it with the correct targets.
    final ReadCountCollection tangentNormalized = targetMapper.fromPoNtoCaseCountCollection(
            tangentNormalizedCounts, targetFactorNormalizedCounts.columnNames());
    final ReadCountCollection preTangentNormalized = targetMapper.fromPoNtoCaseCountCollection(
            tangentNormalizationInputCounts, targetFactorNormalizedCounts.columnNames());
    final RealMatrix tangentBetaHats = SparkConverter.convertSparkRowMatrixToRealMatrix(
            betahatDistMat, tangentNormalizedCounts.getColumnDimension());

    return new PCATangentNormalizationResult(tangentNormalized, preTangentNormalized, tangentBetaHats.transpose(), targetFactorNormalizedCounts);
}
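Spelling out the algebra from the javadoc may make the chain of multiplications above easier to follow; this is only a restatement of the comments, with P the pseudoinverse, A the panel, C the cases and \hat{A} the projection:

\hat{\beta} = C^{T} P^{T}, \qquad
\hat{A}^{T} = \hat{\beta}\, A^{T} = (C^{T} P^{T})\, A^{T}, \qquad
\hat{A} = A\, P\, C, \qquad
C_{\text{tangent}} = C - \hat{A}

The transposed form is used because RowMatrix.multiply can only multiply the distributed matrix on the left by a local matrix on the right, so the distributed operand has to be C^T; the final transpose() turns \hat{A}^{T} back into \hat{A}.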
 
Example #8
Source File: SparkSingularValueDecomposer.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 *  Create an SVD of the given matrix using the given Java Spark Context.
 *
 * @param realMat the target matrix.  Not {@code null}
 * @return never {@code null}
 */
@Override
public SVD createSVD(final RealMatrix realMat){
    Utils.nonNull(realMat, "Cannot perform Spark MLLib SVD on a null matrix.");

    final RowMatrix mat = SparkConverter.convertRealMatrixToSparkRowMatrix(sc, realMat, NUM_SLICES);

    // Compute all of the singular values and corresponding singular vectors.
    final SingularValueDecomposition<RowMatrix, Matrix> svd = mat.computeSVD((int) mat.numCols(), true, 1.0E-9d);

    // Get our distributed results
    final RowMatrix u = svd.U();
    final Vector s = svd.s();
    final Matrix v = svd.V().transpose();

    // Move the matrices from Spark/distributed space to Apache Commons space
    logger.info("Converting distributed Spark matrix to local matrix...");
    final RealMatrix uReal = SparkConverter.convertSparkRowMatrixToRealMatrix(u, realMat.getRowDimension());
    logger.info("Done converting distributed Spark matrix to local matrix...");
    logger.info("Converting Spark matrix to local matrix...");
    final RealMatrix vReal = SparkConverter.convertSparkMatrixToRealMatrix(v);
    logger.info("Done converting Spark matrix to local matrix...");
    final double [] singularValues = s.toArray();

    logger.info("Calculating the pseudoinverse...");
    logger.info("Pinv: calculating tolerance...");

    // Note that the pinv of realMat is V * invS * U'
    final double tolerance = Math.max(realMat.getColumnDimension(), realMat.getRowDimension()) * realMat.getNorm() * EPS;
    logger.info("Pinv: inverting the singular values (with tolerance) and creating a diagonal matrix...");
    final double[] invS = Arrays.stream(singularValues).map(sv -> invertSVWithTolerance(sv, tolerance)).toArray();

    final Matrix invSMat = Matrices.diag(Vectors.dense(invS));
    logger.info("Pinv: Multiplying V * invS * U' to get the pinv (using pinv transpose = U * invS' * V') ...");
    final RowMatrix pinvT = u.multiply(invSMat).multiply(v);
    logger.info("Pinv: Converting back to local matrix ...");
    final RealMatrix pinv = SparkConverter.convertSparkRowMatrixToRealMatrix(pinvT, realMat.getRowDimension()).transpose();
    logger.info("Done calculating the pseudoinverse and converting it...");

    return new SimpleSVD(uReal, s.toArray(), vReal, pinv);
}
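For reference, the pseudoinverse computed at the end of this method follows the standard SVD formula. Writing it out (tol is the tolerance variable in the code; the body of invertSVWithTolerance is not shown above, so treating singular values at or below the tolerance as zero is an assumption about its behavior):

A = U \Sigma V^{T}, \qquad
A^{+} = V\, \Sigma^{+} U^{T}, \qquad
(A^{+})^{T} = U\, \Sigma^{+} V^{T}

\Sigma^{+}_{ii} = \begin{cases} 1/\sigma_{i}, & \sigma_{i} > \mathrm{tol} \\ 0, & \text{otherwise} \end{cases},
\qquad \mathrm{tol} = \max(m, n)\, \lVert A \rVert\, \varepsilon

The code builds (A^{+})^{T} as the distributed product u.multiply(invSMat).multiply(v) -- note that v already holds V^{T} -- and then transposes the collected result to obtain the pseudoinverse.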