org.apache.spark.ml.linalg.SparseVector Java Examples
The following examples show how to use
org.apache.spark.ml.linalg.SparseVector.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: RemoteDPParForSpark.java From systemds with Apache License 2.0 | 5 votes |
@Override public Tuple2<Long, Writable> call(Tuple2<Row, Long> arg0) throws Exception { long rowix = arg0._2() + 1; //process row data int off = _containsID ? 1: 0; Object obj = _isVector ? arg0._1().get(off) : arg0._1(); boolean sparse = (obj instanceof SparseVector); MatrixBlock mb = new MatrixBlock(1, (int)_clen, sparse); if( _isVector ) { Vector vect = (Vector) obj; if( vect instanceof SparseVector ) { SparseVector svect = (SparseVector) vect; int lnnz = svect.numNonzeros(); for( int k=0; k<lnnz; k++ ) mb.appendValue(0, svect.indices()[k], svect.values()[k]); } else { //dense for( int j=0; j<_clen; j++ ) mb.appendValue(0, j, vect.apply(j)); } } else { //row Row row = (Row) obj; for( int j=off; j<off+_clen; j++ ) mb.appendValue(0, j-off, UtilFunctions.getDouble(row.get(j))); } mb.examSparsity(); return new Tuple2<>(rowix, new PairWritableBlock(new MatrixIndexes(1,1),mb)); }
Example #2
Source File: GradientBoostClassificationModelTest.java From spark-transformers with Apache License 2.0 | 5 votes |
@Test public void testGradientBoostClassification() { // Load the data stored in LIBSVM format as a DataFrame. String datapath = "src/test/resources/binary_classification_test.libsvm"; Dataset<Row> data = spark.read().format("libsvm").load(datapath); // Split the data into training and test sets (30% held out for testing) Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3}); Dataset<Row> trainingData = splits[0]; Dataset<Row> testData = splits[1]; // Train a RandomForest model. GBTClassificationModel classificationModel = new GBTClassifier().fit(trainingData); byte[] exportedModel = ModelExporter.export(classificationModel); Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel); List<Row> sparkOutput = classificationModel.transform(testData).select("features", "prediction","label").collectAsList(); // compare predictions for (Row row : sparkOutput) { Map<String, Object> data_ = new HashMap<>(); data_.put("features", ((SparseVector) row.get(0)).toArray()); data_.put("label", (row.get(2)).toString()); transformer.transform(data_); System.out.println(data_); System.out.println(data_.get("prediction")+" ,"+row.get(1)); assertEquals((double) data_.get("prediction"), (double) row.get(1), EPSILON); } }
Example #3
Source File: DecisionTreeRegressionModelBridgeTest.java From spark-transformers with Apache License 2.0 | 5 votes |
@Test public void testDecisionTreeRegressionPrediction() { // Load the data stored in LIBSVM format as a DataFrame. String datapath = "src/test/resources/regression_test.libsvm"; Dataset<Row> data = spark.read().format("libsvm").load(datapath); // Split the data into training and test sets (30% held out for testing) Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3}); Dataset<Row> trainingData = splits[0]; Dataset<Row> testData = splits[1]; // Train a DecisionTree model. DecisionTreeRegressionModel regressionModel = new DecisionTreeRegressor().fit(trainingData); trainingData.printSchema(); List<Row> output = regressionModel.transform(testData).select("features", "prediction").collectAsList(); byte[] exportedModel = ModelExporter.export(regressionModel); DecisionTreeTransformer transformer = (DecisionTreeTransformer) ModelImporter.importAndGetTransformer(exportedModel); System.out.println(transformer); //compare predictions for (Row row : output) { Map<String, Object> data_ = new HashMap<>(); data_.put("features", ((SparseVector) row.get(0)).toArray()); transformer.transform(data_); System.out.println(data_); System.out.println(data_.get("prediction")); assertEquals((double)data_.get("prediction"), (double)row.get(1), EPSILON); } }
Example #4
Source File: DecisionTreeClassificationModelBridgeTest.java From spark-transformers with Apache License 2.0 | 5 votes |
@Test public void testDecisionTreeClassificationPrediction() { // Load the data stored in LIBSVM format as a DataFrame. String datapath = "src/test/resources/classification_test.libsvm"; Dataset<Row> data = spark.read().format("libsvm").load(datapath); // Split the data into training and test sets (30% held out for testing) Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3}); Dataset<Row> trainingData = splits[0]; Dataset<Row> testData = splits[1]; // Train a DecisionTree model. DecisionTreeClassificationModel classifierModel = new DecisionTreeClassifier().fit(trainingData); trainingData.printSchema(); List<Row> output = classifierModel.transform(testData).select("features", "prediction","rawPrediction").collectAsList(); byte[] exportedModel = ModelExporter.export(classifierModel); DecisionTreeTransformer transformer = (DecisionTreeTransformer) ModelImporter.importAndGetTransformer(exportedModel); //compare predictions for (Row row : output) { Map<String, Object> data_ = new HashMap<>(); double [] actualRawPrediction = ((DenseVector) row.get(2)).toArray(); data_.put("features", ((SparseVector) row.get(0)).toArray()); transformer.transform(data_); System.out.println(data_); System.out.println(data_.get("prediction")); assertEquals((double)data_.get("prediction"), (double)row.get(1), EPSILON); assertArrayEquals((double[]) data_.get("rawPrediction"), actualRawPrediction, EPSILON); } }
Example #5
Source File: RemoteDPParForSpark.java From systemds with Apache License 2.0 | 5 votes |
@Override public Tuple2<Long, Writable> call(Tuple2<Row, Long> arg0) throws Exception { long rowix = arg0._2() + 1; //process row data int off = _containsID ? 1: 0; Object obj = _isVector ? arg0._1().get(off) : arg0._1(); boolean sparse = (obj instanceof SparseVector); MatrixBlock mb = new MatrixBlock(1, (int)_clen, sparse); if( _isVector ) { Vector vect = (Vector) obj; if( vect instanceof SparseVector ) { SparseVector svect = (SparseVector) vect; int lnnz = svect.numNonzeros(); for( int k=0; k<lnnz; k++ ) mb.appendValue(0, svect.indices()[k], svect.values()[k]); } else { //dense for( int j=0; j<_clen; j++ ) mb.appendValue(0, j, vect.apply(j)); } } else { //row Row row = (Row) obj; for( int j=off; j<off+_clen; j++ ) mb.appendValue(0, j-off, UtilFunctions.getDouble(row.get(j))); } mb.examSparsity(); return new Tuple2<>(rowix, new PairWritableBlock(new MatrixIndexes(1,1),mb)); }
Example #6
Source File: RDDConverterUtils.java From systemds with Apache License 2.0 | 4 votes |
@Override public Iterator<Tuple2<MatrixIndexes, MatrixBlock>> call(Iterator<Tuple2<org.apache.spark.mllib.regression.LabeledPoint,Long>> arg0) throws Exception { ArrayList<Tuple2<MatrixIndexes,MatrixBlock>> ret = new ArrayList<>(); int ncblks = (int)Math.ceil((double)_clen/_blen); MatrixIndexes[] ix = new MatrixIndexes[ncblks]; MatrixBlock[] mb = new MatrixBlock[ncblks]; while( arg0.hasNext() ) { Tuple2<org.apache.spark.mllib.regression.LabeledPoint,Long> tmp = arg0.next(); org.apache.spark.mllib.regression.LabeledPoint row = tmp._1(); boolean lsparse = _sparseX || (!_labels && row.features() instanceof org.apache.spark.mllib.linalg.SparseVector); long rowix = tmp._2() + 1; long rix = UtilFunctions.computeBlockIndex(rowix, _blen); int pos = UtilFunctions.computeCellInBlock(rowix, _blen); //create new blocks for entire row if( ix[0] == null || ix[0].getRowIndex() != rix ) { if( ix[0] !=null ) flushBlocksToList(ix, mb, ret); long len = UtilFunctions.computeBlockSize(_rlen, rix, _blen); createBlocks(rowix, (int)len, ix, mb, lsparse); } //process row data if( _labels ) { double val = row.label(); mb[0].appendValue(pos, 0, val); _aNnz.add((val != 0) ? 1 : 0); } else { //features int lnnz = row.features().numNonzeros(); if( row.features() instanceof org.apache.spark.mllib.linalg.SparseVector ) { org.apache.spark.mllib.linalg.SparseVector srow = (org.apache.spark.mllib.linalg.SparseVector) row.features(); for( int k=0; k<lnnz; k++ ) { int gix = srow.indices()[k]+1; int cix = (int)UtilFunctions.computeBlockIndex(gix, _blen); int j = UtilFunctions.computeCellInBlock(gix, _blen); mb[cix-1].appendValue(pos, j, srow.values()[k]); } } else { //dense for( int cix=1, pix=0; cix<=ncblks; cix++ ) { int lclen = UtilFunctions.computeBlockSize(_clen, cix, _blen); for( int j=0; j<lclen; j++ ) mb[cix-1].appendValue(pos, j, row.features().apply(pix++)); } } _aNnz.add(lnnz); } } //flush last blocks flushBlocksToList(ix, mb, ret); return ret.iterator(); }
Example #7
Source File: RDDConverterUtils.java From systemds with Apache License 2.0 | 4 votes |
@Override public Iterator<Tuple2<MatrixIndexes, MatrixBlock>> call(Iterator<Tuple2<Row, Long>> arg0) throws Exception { ArrayList<Tuple2<MatrixIndexes,MatrixBlock>> ret = new ArrayList<>(); int ncblks = (int)Math.ceil((double)_clen/_blen); MatrixIndexes[] ix = new MatrixIndexes[ncblks]; MatrixBlock[] mb = new MatrixBlock[ncblks]; while( arg0.hasNext() ) { Tuple2<Row,Long> tmp = arg0.next(); long rowix = tmp._2() + 1; long rix = UtilFunctions.computeBlockIndex(rowix, _blen); int pos = UtilFunctions.computeCellInBlock(rowix, _blen); //create new blocks for entire row if( ix[0] == null || ix[0].getRowIndex() != rix ) { if( ix[0] !=null ) flushBlocksToList(ix, mb, ret); long len = UtilFunctions.computeBlockSize(_rlen, rix, _blen); createBlocks(rowix, (int)len, ix, mb); } //process row data int off = _containsID ? 1 : 0; Object obj = _isVector ? tmp._1().get(off) : tmp._1(); for( int cix=1, pix=_isVector?0:off; cix<=ncblks; cix++ ) { int lclen = UtilFunctions.computeBlockSize(_clen, cix, _blen); int cu = (int) Math.min(_clen, cix*_blen) + (_isVector?0:off); //allocate sparse row once (avoid re-allocations) if( mb[cix-1].isInSparseFormat() ) { int lnnz = countNnz(obj, _isVector, pix, cu); mb[cix-1].getSparseBlock().allocate(pos, lnnz); } //append data to matrix blocks if( _isVector ) { Vector vect = (Vector) obj; if( vect instanceof SparseVector ) { SparseVector svect = (SparseVector) vect; int[] svectIx = svect.indices(); while( pix<svectIx.length && svectIx[pix]<cu ) { int j = UtilFunctions.computeCellInBlock(svectIx[pix]+1, _blen); mb[cix-1].appendValue(pos, j, svect.values()[pix++]); } } else { //dense for( int j=0; j<lclen; j++ ) mb[cix-1].appendValue(pos, j, vect.apply(pix++)); } } else { //row Row row = (Row) obj; for( int j=0; j<lclen; j++ ) mb[cix-1].appendValue(pos, j, UtilFunctions.getDouble(row.get(pix++))); } } } //flush last blocks flushBlocksToList(ix, mb, ret); return ret.iterator(); }
Example #8
Source File: DatasetClassifier.java From mmtf-spark with Apache License 2.0 | 4 votes |
/** * @param args args[0] path to parquet file, args[1] name of classification column * @throws IOException * @throws StructureException */ public static void main(String[] args) throws IOException { if (args.length != 2) { System.err.println("Usage: " + DatasetClassifier.class.getSimpleName() + " <parquet file> <classification column name>"); System.exit(1); } // name of the class label String label = args[1]; long start = System.nanoTime(); SparkSession spark = SparkSession .builder() .master("local[*]") .appName(DatasetClassifier.class.getSimpleName()) .getOrCreate(); Dataset<Row> data = spark.read().parquet(args[0]).cache(); int featureCount = 0; Object vector = data.first().getAs("features"); if (vector instanceof DenseVector) { featureCount = ((DenseVector)vector).numActives(); } else if (vector instanceof SparseVector) { featureCount = ((SparseVector)vector).numActives(); } System.out.println("Feature count : " + featureCount); int classCount = (int)data.select(label).distinct().count(); System.out.println("Class count : " + classCount); System.out.println("Dataset size (unbalanced): " + data.count()); data.groupBy(label).count().show(classCount); data = DatasetBalancer.downsample(data, label, 1); System.out.println("Dataset size (balanced) : " + data.count()); data.groupBy(label).count().show(classCount); double testFraction = 0.3; long seed = 123; SparkMultiClassClassifier mcc; Map<String, String> metrics; DecisionTreeClassifier dtc = new DecisionTreeClassifier(); mcc = new SparkMultiClassClassifier(dtc, label, testFraction, seed); metrics = mcc.fit(data); System.out.println(metrics); RandomForestClassifier rfc = new RandomForestClassifier(); mcc = new SparkMultiClassClassifier(rfc, label, testFraction, seed); metrics = mcc.fit(data); System.out.println(metrics); LogisticRegression lr = new LogisticRegression(); mcc = new SparkMultiClassClassifier(lr, label, testFraction, seed); metrics = mcc.fit(data); System.out.println(metrics); // specify layers for the neural network // input layer: dimension of feature vector // output layer: number of classes int[] layers = new int[] {featureCount, 10, classCount}; MultilayerPerceptronClassifier mpc = new MultilayerPerceptronClassifier() .setLayers(layers) .setBlockSize(128) .setSeed(1234L) .setMaxIter(200); mcc = new SparkMultiClassClassifier(mpc, label, testFraction, seed); metrics = mcc.fit(data); System.out.println(metrics); long end = System.nanoTime(); System.out.println((end-start)/1E9 + " sec"); }
Example #9
Source File: DecisionTreeRegressionModelBridgePipelineTest.java From spark-transformers with Apache License 2.0 | 4 votes |
@Test public void testDecisionTreeRegressionPrediction() { // Load the data stored in LIBSVM format as a DataFrame. String datapath = "src/test/resources/regression_test.libsvm"; Dataset<Row> data = spark.read().format("libsvm").load(datapath); // Split the data into training and test sets (30% held out for testing) Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3}); Dataset<Row> trainingData = splits[0]; Dataset<Row> testData = splits[1]; StringIndexer indexer = new StringIndexer() .setInputCol("label") .setOutputCol("labelIndex").setHandleInvalid("skip"); DecisionTreeRegressor regressionModel = new DecisionTreeRegressor().setLabelCol("labelIndex").setFeaturesCol("features"); Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[]{indexer, regressionModel}); PipelineModel sparkPipeline = pipeline.fit(trainingData); byte[] exportedModel = ModelExporter.export(sparkPipeline); Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel); List<Row> output = sparkPipeline.transform(testData).select("features", "prediction", "label").collectAsList(); //compare predictions for (Row row : output) { Map<String, Object> data_ = new HashMap<>(); data_.put("features", ((SparseVector) row.get(0)).toArray()); data_.put("label", (row.get(2)).toString()); transformer.transform(data_); System.out.println(data_); System.out.println(data_.get("prediction")); assertEquals((double)data_.get("prediction"), (double)row.get(1), EPSILON); } }
Example #10
Source File: GradientBoostClassificationModelPipelineTest.java From spark-transformers with Apache License 2.0 | 4 votes |
@Test public void testGradientBoostClassification() { // Load the data stored in LIBSVM format as a DataFrame. String datapath = "src/test/resources/binary_classification_test.libsvm"; Dataset<Row> data = spark.read().format("libsvm").load(datapath); StringIndexer indexer = new StringIndexer() .setInputCol("label") .setOutputCol("labelIndex"); // Split the data into training and test sets (30% held out for testing) Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3}); Dataset<Row> trainingData = splits[0]; Dataset<Row> testData = splits[1]; // Train a RandomForest model. GBTClassifier classificationModel = new GBTClassifier().setLabelCol("labelIndex") .setFeaturesCol("features");; Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[]{indexer, classificationModel}); PipelineModel sparkPipeline = pipeline.fit(trainingData); // Export this model byte[] exportedModel = ModelExporter.export(sparkPipeline); // Import and get Transformer Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel); List<Row> sparkOutput = sparkPipeline.transform(testData).select("features", "prediction", "label").collectAsList(); // compare predictions for (Row row : sparkOutput) { Map<String, Object> data_ = new HashMap<>(); data_.put("features", ((SparseVector) row.get(0)).toArray()); data_.put("label", (row.get(2)).toString()); transformer.transform(data_); System.out.println(data_); System.out.println(data_.get("prediction")+" ,"+row.get(1)); assertEquals((double) data_.get("prediction"), (double) row.get(1), EPSILON); } }
Example #11
Source File: DecisionTreeClassificationModelBridgePipelineTest.java From spark-transformers with Apache License 2.0 | 4 votes |
@Test public void testDecisionTreeClassificationWithPipeline() { // Load the data stored in LIBSVM format as a DataFrame. String datapath = "src/test/resources/classification_test.libsvm"; Dataset<Row> data = spark.read().format("libsvm").load(datapath); // Split the data into training and test sets (30% held out for testing) Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3}); Dataset<Row> trainingData = splits[0]; Dataset<Row> testData = splits[1]; StringIndexer indexer = new StringIndexer() .setInputCol("label") .setOutputCol("labelIndex"); // Train a DecisionTree model. DecisionTreeClassifier classificationModel = new DecisionTreeClassifier() .setLabelCol("labelIndex") .setFeaturesCol("features"); Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[]{indexer, classificationModel}); // Train model. This also runs the indexer. PipelineModel sparkPipeline = pipeline.fit(trainingData); //Export this model byte[] exportedModel = ModelExporter.export(sparkPipeline); //Import and get Transformer Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel); List<Row> output = sparkPipeline.transform(testData).select("features", "label","prediction","rawPrediction").collectAsList(); //compare predictions for (Row row : output) { Map<String, Object> data_ = new HashMap<>(); double [] actualRawPrediction = ((DenseVector) row.get(3)).toArray(); data_.put("features", ((SparseVector) row.get(0)).toArray()); data_.put("label", (row.get(1)).toString()); transformer.transform(data_); System.out.println(data_); System.out.println(data_.get("prediction")); assertEquals((double)data_.get("prediction"), (double)row.get(2), EPSILON); assertArrayEquals((double[]) data_.get("rawPrediction"), actualRawPrediction, EPSILON); } }
Example #12
Source File: RDDConverterUtils.java From systemds with Apache License 2.0 | 4 votes |
@Override public Iterator<Tuple2<MatrixIndexes, MatrixBlock>> call(Iterator<Tuple2<org.apache.spark.mllib.regression.LabeledPoint,Long>> arg0) throws Exception { ArrayList<Tuple2<MatrixIndexes,MatrixBlock>> ret = new ArrayList<>(); int ncblks = (int)Math.ceil((double)_clen/_blen); MatrixIndexes[] ix = new MatrixIndexes[ncblks]; MatrixBlock[] mb = new MatrixBlock[ncblks]; while( arg0.hasNext() ) { Tuple2<org.apache.spark.mllib.regression.LabeledPoint,Long> tmp = arg0.next(); org.apache.spark.mllib.regression.LabeledPoint row = tmp._1(); boolean lsparse = _sparseX || (!_labels && row.features() instanceof org.apache.spark.mllib.linalg.SparseVector); long rowix = tmp._2() + 1; long rix = UtilFunctions.computeBlockIndex(rowix, _blen); int pos = UtilFunctions.computeCellInBlock(rowix, _blen); //create new blocks for entire row if( ix[0] == null || ix[0].getRowIndex() != rix ) { if( ix[0] !=null ) flushBlocksToList(ix, mb, ret); long len = UtilFunctions.computeBlockSize(_rlen, rix, _blen); createBlocks(rowix, (int)len, ix, mb, lsparse); } //process row data if( _labels ) { double val = row.label(); mb[0].appendValue(pos, 0, val); _aNnz.add((val != 0) ? 1 : 0); } else { //features int lnnz = row.features().numNonzeros(); if( row.features() instanceof org.apache.spark.mllib.linalg.SparseVector ) { org.apache.spark.mllib.linalg.SparseVector srow = (org.apache.spark.mllib.linalg.SparseVector) row.features(); for( int k=0; k<lnnz; k++ ) { int gix = srow.indices()[k]+1; int cix = (int)UtilFunctions.computeBlockIndex(gix, _blen); int j = UtilFunctions.computeCellInBlock(gix, _blen); mb[cix-1].appendValue(pos, j, srow.values()[k]); } } else { //dense for( int cix=1, pix=0; cix<=ncblks; cix++ ) { int lclen = UtilFunctions.computeBlockSize(_clen, cix, _blen); for( int j=0; j<lclen; j++ ) mb[cix-1].appendValue(pos, j, row.features().apply(pix++)); } } _aNnz.add(lnnz); } } //flush last blocks flushBlocksToList(ix, mb, ret); return ret.iterator(); }
Example #13
Source File: RDDConverterUtils.java From systemds with Apache License 2.0 | 4 votes |
@Override public Iterator<Tuple2<MatrixIndexes, MatrixBlock>> call(Iterator<Tuple2<Row, Long>> arg0) throws Exception { ArrayList<Tuple2<MatrixIndexes,MatrixBlock>> ret = new ArrayList<>(); int ncblks = (int)Math.ceil((double)_clen/_blen); MatrixIndexes[] ix = new MatrixIndexes[ncblks]; MatrixBlock[] mb = new MatrixBlock[ncblks]; while( arg0.hasNext() ) { Tuple2<Row,Long> tmp = arg0.next(); long rowix = tmp._2() + 1; long rix = UtilFunctions.computeBlockIndex(rowix, _blen); int pos = UtilFunctions.computeCellInBlock(rowix, _blen); //create new blocks for entire row if( ix[0] == null || ix[0].getRowIndex() != rix ) { if( ix[0] !=null ) flushBlocksToList(ix, mb, ret); long len = UtilFunctions.computeBlockSize(_rlen, rix, _blen); createBlocks(rowix, (int)len, ix, mb); } //process row data int off = _containsID ? 1 : 0; Object obj = _isVector ? tmp._1().get(off) : tmp._1(); for( int cix=1, pix=_isVector?0:off; cix<=ncblks; cix++ ) { int lclen = UtilFunctions.computeBlockSize(_clen, cix, _blen); int cu = (int) Math.min(_clen, cix*_blen) + (_isVector?0:off); //allocate sparse row once (avoid re-allocations) if( mb[cix-1].isInSparseFormat() ) { int lnnz = countNnz(obj, _isVector, pix, cu); mb[cix-1].getSparseBlock().allocate(pos, lnnz); } //append data to matrix blocks if( _isVector ) { Vector vect = (Vector) obj; if( vect instanceof SparseVector ) { SparseVector svect = (SparseVector) vect; int[] svectIx = svect.indices(); while( pix<svectIx.length && svectIx[pix]<cu ) { int j = UtilFunctions.computeCellInBlock(svectIx[pix]+1, _blen); mb[cix-1].appendValue(pos, j, svect.values()[pix++]); } } else { //dense for( int j=0; j<lclen; j++ ) mb[cix-1].appendValue(pos, j, vect.apply(pix++)); } } else { //row Row row = (Row) obj; for( int j=0; j<lclen; j++ ) mb[cix-1].appendValue(pos, j, UtilFunctions.getDouble(row.get(pix++))); } } } //flush last blocks flushBlocksToList(ix, mb, ret); return ret.iterator(); }