org.apache.spark.ml.linalg.Vectors Java Examples
The following examples show how to use
org.apache.spark.ml.linalg.Vectors.
You can vote up the examples you find useful or vote down those you don't,
and navigate to the original project or source file by following the links above each example. Related API usage is shown on the sidebar.
Example #1
Source File: MLContextTest.java From systemds with Apache License 2.0 | 6 votes |
/**
 * Sums a DataFrame of ml.linalg vectors that carries an explicit row-ID column,
 * with the input format declared via MatrixMetadata (DF_VECTOR_WITH_INDEX).
 */
@Test
public void testDataFrameSumDMLVectorWithIDColumn() {
    System.out.println("MLContextTest - DataFrame sum DML, vector with ID column");

    // Three (id, vector) rows whose nine cells sum to 45.
    List<Tuple2<Double, Vector>> rows = new ArrayList<>();
    rows.add(new Tuple2<>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
    rows.add(new Tuple2<>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
    rows.add(new Tuple2<>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
    JavaRDD<Row> rowRdd = sc.parallelize(rows).map(new DoubleVectorRow());

    // Schema: ID column first, then a single vector column.
    List<StructField> schemaFields = new ArrayList<>();
    schemaFields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    schemaFields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    Dataset<Row> df = spark.createDataFrame(rowRdd, DataTypes.createStructType(schemaFields));

    MatrixMetadata meta = new MatrixMetadata(MatrixFormat.DF_VECTOR_WITH_INDEX);
    Script script = dml("print('sum: ' + sum(M));").in("M", df, meta);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Example #2
Source File: MLContextTest.java From systemds with Apache License 2.0 | 6 votes |
/**
 * Sums a DataFrame of ml.linalg vectors without an ID column and without any
 * MatrixMetadata, so MLContext must infer the input format itself.
 */
@Test
public void testDataFrameSumDMLVectorWithNoIDColumnNoFormatSpecified() {
    System.out.println("MLContextTest - DataFrame sum DML, vector with no ID column, no format specified");

    // One vector per row; the nine cells sum to 45.
    List<Vector> rows = new ArrayList<>();
    rows.add(Vectors.dense(1.0, 2.0, 3.0));
    rows.add(Vectors.dense(4.0, 5.0, 6.0));
    rows.add(Vectors.dense(7.0, 8.0, 9.0));
    JavaRDD<Row> rowRdd = sc.parallelize(rows).map(new VectorRow());

    List<StructField> schemaFields = new ArrayList<>();
    schemaFields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    Dataset<Row> df = spark.createDataFrame(rowRdd, DataTypes.createStructType(schemaFields));

    // No MatrixMetadata passed: format detection is under test here.
    Script script = dml("print('sum: ' + sum(M));").in("M", df);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Example #3
Source File: MLContextTest.java From systemds with Apache License 2.0 | 6 votes |
/**
 * Sums a DataFrame of ml.linalg vectors with an explicit row-ID column but no
 * MatrixMetadata, exercising MLContext's automatic format detection.
 */
@Test
public void testDataFrameSumDMLVectorWithIDColumnNoFormatSpecified() {
    System.out.println("MLContextTest - DataFrame sum DML, vector with ID column, no format specified");

    // Three (id, vector) rows whose nine cells sum to 45.
    List<Tuple2<Double, Vector>> rows = new ArrayList<>();
    rows.add(new Tuple2<>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
    rows.add(new Tuple2<>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
    rows.add(new Tuple2<>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
    JavaRDD<Row> rowRdd = sc.parallelize(rows).map(new DoubleVectorRow());

    // Schema: ID column first, then a single vector column.
    List<StructField> schemaFields = new ArrayList<>();
    schemaFields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    schemaFields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    Dataset<Row> df = spark.createDataFrame(rowRdd, DataTypes.createStructType(schemaFields));

    // No MatrixMetadata passed: format detection is under test here.
    Script script = dml("print('sum: ' + sum(M));").in("M", df);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Example #4
Source File: MLContextTest.java From systemds with Apache License 2.0 | 6 votes |
/**
 * Sums a DataFrame of legacy mllib.linalg vectors (no ID column), with the
 * input format declared via MatrixMetadata (DF_VECTOR).
 */
@Test
public void testDataFrameSumDMLMllibVectorWithNoIDColumn() {
    System.out.println("MLContextTest - DataFrame sum DML, mllib vector with no ID column");

    // Legacy mllib vectors (not ml.linalg), one per row; cells sum to 45.
    List<org.apache.spark.mllib.linalg.Vector> rows = new ArrayList<>();
    rows.add(org.apache.spark.mllib.linalg.Vectors.dense(1.0, 2.0, 3.0));
    rows.add(org.apache.spark.mllib.linalg.Vectors.dense(4.0, 5.0, 6.0));
    rows.add(org.apache.spark.mllib.linalg.Vectors.dense(7.0, 8.0, 9.0));
    JavaRDD<Row> rowRdd = sc.parallelize(rows).map(new MllibVectorRow());

    // Single vector column typed with the mllib VectorUDT.
    List<StructField> schemaFields = new ArrayList<>();
    schemaFields.add(DataTypes.createStructField("C1", new org.apache.spark.mllib.linalg.VectorUDT(), true));
    Dataset<Row> df = spark.createDataFrame(rowRdd, DataTypes.createStructType(schemaFields));

    MatrixMetadata meta = new MatrixMetadata(MatrixFormat.DF_VECTOR);
    Script script = dml("print('sum: ' + sum(M));").in("M", df, meta);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Example #5
Source File: MLContextTest.java From systemds with Apache License 2.0 | 6 votes |
/**
 * Sums a DataFrame of ml.linalg vectors without an ID column, with the input
 * format declared via MatrixMetadata (DF_VECTOR).
 */
@Test
public void testDataFrameSumDMLVectorWithNoIDColumn() {
    System.out.println("MLContextTest - DataFrame sum DML, vector with no ID column");

    // One vector per row; the nine cells sum to 45.
    List<Vector> rows = new ArrayList<>();
    rows.add(Vectors.dense(1.0, 2.0, 3.0));
    rows.add(Vectors.dense(4.0, 5.0, 6.0));
    rows.add(Vectors.dense(7.0, 8.0, 9.0));
    JavaRDD<Row> rowRdd = sc.parallelize(rows).map(new VectorRow());

    List<StructField> schemaFields = new ArrayList<>();
    schemaFields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    Dataset<Row> df = spark.createDataFrame(rowRdd, DataTypes.createStructType(schemaFields));

    MatrixMetadata meta = new MatrixMetadata(MatrixFormat.DF_VECTOR);
    Script script = dml("print('sum: ' + sum(M));").in("M", df, meta);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Example #6
Source File: MLContextTest.java From systemds with Apache License 2.0 | 6 votes |
/**
 * Sums a DataFrame of legacy mllib.linalg vectors carrying an explicit row-ID
 * column, with the format declared via MatrixMetadata (DF_VECTOR_WITH_INDEX).
 */
@Test
public void testDataFrameSumDMLMllibVectorWithIDColumn() {
    System.out.println("MLContextTest - DataFrame sum DML, mllib vector with ID column");

    // Three (id, mllib vector) rows whose nine cells sum to 45.
    List<Tuple2<Double, org.apache.spark.mllib.linalg.Vector>> rows = new ArrayList<>();
    rows.add(new Tuple2<>(1.0, org.apache.spark.mllib.linalg.Vectors.dense(1.0, 2.0, 3.0)));
    rows.add(new Tuple2<>(2.0, org.apache.spark.mllib.linalg.Vectors.dense(4.0, 5.0, 6.0)));
    rows.add(new Tuple2<>(3.0, org.apache.spark.mllib.linalg.Vectors.dense(7.0, 8.0, 9.0)));
    JavaRDD<Row> rowRdd = sc.parallelize(rows).map(new DoubleMllibVectorRow());

    // Schema: ID column first, then a single mllib vector column.
    List<StructField> schemaFields = new ArrayList<>();
    schemaFields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    schemaFields.add(DataTypes.createStructField("C1", new org.apache.spark.mllib.linalg.VectorUDT(), true));
    Dataset<Row> df = spark.createDataFrame(rowRdd, DataTypes.createStructType(schemaFields));

    MatrixMetadata meta = new MatrixMetadata(MatrixFormat.DF_VECTOR_WITH_INDEX);
    Script script = dml("print('sum: ' + sum(M));").in("M", df, meta);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Example #7
Source File: MLContextTest.java From systemds with Apache License 2.0 | 6 votes |
// Verifies DataFrame-to-matrix conversion for an ml.linalg Vector column preceded by an
// explicit row-ID column, with the format stated via MatrixMetadata (DF_VECTOR_WITH_INDEX).
// NOTE(review): identical to the method in Example #1 — these listings likely come from
// different versions of MLContextTest.java; two copies would not compile in one class.
@Test public void testDataFrameSumDMLVectorWithIDColumn() {
    System.out.println("MLContextTest - DataFrame sum DML, vector with ID column");
    List<Tuple2<Double, Vector>> list = new ArrayList<>();
    list.add(new Tuple2<>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
    list.add(new Tuple2<>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
    list.add(new Tuple2<>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
    JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());
    List<StructField> fields = new ArrayList<>();
    // First column holds the row index expected by DF_VECTOR_WITH_INDEX.
    fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
    MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR_WITH_INDEX);
    Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
    // All nine matrix cells sum to 45.
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Example #8
Source File: MLContextTest.java From systemds with Apache License 2.0 | 6 votes |
// Sums a vector DataFrame with no ID column and no MatrixMetadata (format inference).
// NOTE(review): the method name and log message say PYDML, but the script is built with
// dml(...) — presumably a leftover from when PYDML support was removed; confirm intent.
@Test public void testDataFrameSumPYDMLVectorWithNoIDColumnNoFormatSpecified() {
    System.out.println("MLContextTest - DataFrame sum PYDML, vector with no ID column, no format specified");
    List<Vector> list = new ArrayList<>();
    list.add(Vectors.dense(1.0, 2.0, 3.0));
    list.add(Vectors.dense(4.0, 5.0, 6.0));
    list.add(Vectors.dense(7.0, 8.0, 9.0));
    JavaRDD<Vector> javaRddVector = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddVector.map(new VectorRow());
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
    // No MatrixMetadata: MLContext must detect the vector format itself.
    Script script = dml("print('sum: ' + sum(M))").in("M", dataFrame);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Example #9
Source File: MLContextTest.java From systemds with Apache License 2.0 | 6 votes |
// Verifies DataFrame-to-matrix conversion for a legacy mllib.linalg Vector column with an
// explicit row-ID column, format stated via MatrixMetadata (DF_VECTOR_WITH_INDEX).
// NOTE(review): identical to the method in Example #6 — likely scraped from a different
// file version; duplicates would not compile in one class.
@Test public void testDataFrameSumDMLMllibVectorWithIDColumn() {
    System.out.println("MLContextTest - DataFrame sum DML, mllib vector with ID column");
    List<Tuple2<Double, org.apache.spark.mllib.linalg.Vector>> list = new ArrayList<>();
    list.add(new Tuple2<>(1.0, org.apache.spark.mllib.linalg.Vectors.dense(1.0, 2.0, 3.0)));
    list.add(new Tuple2<>(2.0, org.apache.spark.mllib.linalg.Vectors.dense(4.0, 5.0, 6.0)));
    list.add(new Tuple2<>(3.0, org.apache.spark.mllib.linalg.Vectors.dense(7.0, 8.0, 9.0)));
    JavaRDD<Tuple2<Double, org.apache.spark.mllib.linalg.Vector>> javaRddTuple = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleMllibVectorRow());
    List<StructField> fields = new ArrayList<>();
    // First column holds the row index expected by DF_VECTOR_WITH_INDEX.
    fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C1", new org.apache.spark.mllib.linalg.VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
    MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR_WITH_INDEX);
    Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
    // All nine matrix cells sum to 45.
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Example #10
Source File: MLContextTest.java From systemds with Apache License 2.0 | 6 votes |
// Verifies DataFrame-to-matrix conversion for an ml.linalg Vector column with no ID
// column, format stated via MatrixMetadata (DF_VECTOR).
// NOTE(review): identical to the method in Example #5 — likely from a different file version.
@Test public void testDataFrameSumDMLVectorWithNoIDColumn() {
    System.out.println("MLContextTest - DataFrame sum DML, vector with no ID column");
    List<Vector> list = new ArrayList<>();
    list.add(Vectors.dense(1.0, 2.0, 3.0));
    list.add(Vectors.dense(4.0, 5.0, 6.0));
    list.add(Vectors.dense(7.0, 8.0, 9.0));
    JavaRDD<Vector> javaRddVector = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddVector.map(new VectorRow());
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
    MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR);
    Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
    // All nine matrix cells sum to 45.
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Example #11
Source File: MLContextTest.java From systemds with Apache License 2.0 | 6 votes |
// Verifies DataFrame-to-matrix conversion for a legacy mllib.linalg Vector column with no
// ID column, format stated via MatrixMetadata (DF_VECTOR).
// NOTE(review): identical to the method in Example #4 — likely from a different file version.
@Test public void testDataFrameSumDMLMllibVectorWithNoIDColumn() {
    System.out.println("MLContextTest - DataFrame sum DML, mllib vector with no ID column");
    List<org.apache.spark.mllib.linalg.Vector> list = new ArrayList<>();
    list.add(org.apache.spark.mllib.linalg.Vectors.dense(1.0, 2.0, 3.0));
    list.add(org.apache.spark.mllib.linalg.Vectors.dense(4.0, 5.0, 6.0));
    list.add(org.apache.spark.mllib.linalg.Vectors.dense(7.0, 8.0, 9.0));
    JavaRDD<org.apache.spark.mllib.linalg.Vector> javaRddVector = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddVector.map(new MllibVectorRow());
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField("C1", new org.apache.spark.mllib.linalg.VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
    MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR);
    Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
    // All nine matrix cells sum to 45.
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Example #12
Source File: MLContextTest.java From systemds with Apache License 2.0 | 6 votes |
// Verifies format auto-detection: ml.linalg Vector column plus explicit row-ID column,
// but no MatrixMetadata supplied.
// NOTE(review): identical to the method in Example #3 — likely from a different file version.
@Test public void testDataFrameSumDMLVectorWithIDColumnNoFormatSpecified() {
    System.out.println("MLContextTest - DataFrame sum DML, vector with ID column, no format specified");
    List<Tuple2<Double, Vector>> list = new ArrayList<>();
    list.add(new Tuple2<>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
    list.add(new Tuple2<>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
    list.add(new Tuple2<>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
    JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
    // No MatrixMetadata: MLContext must detect the vector-with-index format itself.
    Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame);
    // All nine matrix cells sum to 45.
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Example #13
Source File: MLContextTest.java From systemds with Apache License 2.0 | 6 votes |
// Sums a vector DataFrame with an explicit row-ID column and no MatrixMetadata.
// NOTE(review): the method name and log message say PYDML, but the script is built with
// dml(...) — presumably a leftover from when PYDML support was removed; confirm intent.
@Test public void testDataFrameSumPYDMLVectorWithIDColumnNoFormatSpecified() {
    System.out.println("MLContextTest - DataFrame sum PYDML, vector with ID column, no format specified");
    List<Tuple2<Double, Vector>> list = new ArrayList<>();
    list.add(new Tuple2<>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
    list.add(new Tuple2<>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
    list.add(new Tuple2<>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
    JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
    Script script = dml("print('sum: ' + sum(M))").in("M", dataFrame);
    // All nine matrix cells sum to 45.
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Example #14
Source File: MLContextTest.java From systemds with Apache License 2.0 | 6 votes |
// Verifies format auto-detection: single ml.linalg Vector column, no ID column,
// no MatrixMetadata supplied.
// NOTE(review): identical to the method in Example #2 — likely from a different file version.
@Test public void testDataFrameSumDMLVectorWithNoIDColumnNoFormatSpecified() {
    System.out.println("MLContextTest - DataFrame sum DML, vector with no ID column, no format specified");
    List<Vector> list = new ArrayList<>();
    list.add(Vectors.dense(1.0, 2.0, 3.0));
    list.add(Vectors.dense(4.0, 5.0, 6.0));
    list.add(Vectors.dense(7.0, 8.0, 9.0));
    JavaRDD<Vector> javaRddVector = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddVector.map(new VectorRow());
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
    // No MatrixMetadata: MLContext must detect the vector format itself.
    Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame);
    // All nine matrix cells sum to 45.
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Example #15
Source File: JavaAFTSurvivalRegressionExample.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) { SparkSession spark = SparkSession .builder() .appName("JavaAFTSurvivalRegressionExample") .getOrCreate(); // $example on$ List<Row> data = Arrays.asList( RowFactory.create(1.218, 1.0, Vectors.dense(1.560, -0.605)), RowFactory.create(2.949, 0.0, Vectors.dense(0.346, 2.158)), RowFactory.create(3.627, 0.0, Vectors.dense(1.380, 0.231)), RowFactory.create(0.273, 1.0, Vectors.dense(0.520, 1.151)), RowFactory.create(4.199, 0.0, Vectors.dense(0.795, -0.226)) ); StructType schema = new StructType(new StructField[]{ new StructField("label", DataTypes.DoubleType, false, Metadata.empty()), new StructField("censor", DataTypes.DoubleType, false, Metadata.empty()), new StructField("features", new VectorUDT(), false, Metadata.empty()) }); Dataset<Row> training = spark.createDataFrame(data, schema); double[] quantileProbabilities = new double[]{0.3, 0.6}; AFTSurvivalRegression aft = new AFTSurvivalRegression() .setQuantileProbabilities(quantileProbabilities) .setQuantilesCol("quantiles"); AFTSurvivalRegressionModel model = aft.fit(training); // Print the coefficients, intercept and scale parameter for AFT survival regression System.out.println("Coefficients: " + model.coefficients()); System.out.println("Intercept: " + model.intercept()); System.out.println("Scale: " + model.scale()); model.transform(training).show(false); // $example off$ spark.stop(); }
Example #16
Source File: RDDConverterUtils.java From systemds with Apache License 2.0 | 5 votes |
private static Vector createVector(MatrixBlock row) { if( row.isEmptyBlock(false) ) //EMPTY SPARSE ROW return Vectors.sparse(row.getNumColumns(), new int[0], new double[0]); else if( row.isInSparseFormat() ) //SPARSE ROW return Vectors.sparse(row.getNumColumns(), row.getSparseBlock().indexes(0), row.getSparseBlock().values(0)); else // DENSE ROW return Vectors.dense(row.getDenseBlockValues()); }
Example #17
Source File: RDDConverterUtils.java From systemds with Apache License 2.0 | 5 votes |
// Converts a single-row MatrixBlock into a Spark ml.linalg Vector, preserving the
// block's storage layout (sparse stays sparse, dense stays dense).
// NOTE(review): identical to the method in Example #16 — likely from a different file version.
private static Vector createVector(MatrixBlock row) {
    if( row.isEmptyBlock(false) ) //EMPTY SPARSE ROW
        return Vectors.sparse(row.getNumColumns(), new int[0], new double[0]);
    else if( row.isInSparseFormat() ) //SPARSE ROW
        // Index/value arrays of sparse row 0 map directly onto a sparse vector.
        return Vectors.sparse(row.getNumColumns(),
            row.getSparseBlock().indexes(0), row.getSparseBlock().values(0));
    else // DENSE ROW
        return Vectors.dense(row.getDenseBlockValues());
}
Example #18
Source File: MinMaxScalerBridgeTest.java From spark-transformers with Apache License 2.0 | 5 votes |
@Test public void testMinMaxScaler() { //prepare data JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList( RowFactory.create(1.0, Vectors.dense(data[0])), RowFactory.create(2.0, Vectors.dense(data[1])), RowFactory.create(3.0, Vectors.dense(data[2])), RowFactory.create(4.0, Vectors.dense(data[3])) )); StructType schema = new StructType(new StructField[]{ new StructField("label", DataTypes.DoubleType, false, Metadata.empty()), new StructField("features", new VectorUDT(), false, Metadata.empty()) }); Dataset<Row> df = spark.createDataFrame(jrdd, schema); //train model in spark MinMaxScalerModel sparkModel = new MinMaxScaler() .setInputCol("features") .setOutputCol("scaled") .setMin(-5) .setMax(5) .fit(df); //Export model, import it back and get transformer byte[] exportedModel = ModelExporter.export(sparkModel); final Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel); //compare predictions List<Row> sparkOutput = sparkModel.transform(df).orderBy("label").select("features", "scaled").collectAsList(); assertCorrectness(sparkOutput, expected, transformer); }
Example #19
Source File: SimplePredictionFromTextFile.java From net.jgp.labs.spark with Apache License 2.0 | 5 votes |
/**
 * Loads (feature, label) pairs from a CSV file, fits a linear regression,
 * prints training diagnostics, and predicts the label for a single feature.
 */
private void start() {
    SparkSession spark = SparkSession.builder().appName(
        "Simple prediction from Text File").master("local").getOrCreate();

    // UDF that wraps a double column into an ml Vector usable as "features".
    spark.udf().register("vectorBuilder", new VectorBuilder(), new VectorUDT());

    String filename = "data/tuple-data-file.csv";
    StructType schema = new StructType(new StructField[] {
        new StructField("_c0", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("_c1", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("features", new VectorUDT(), true, Metadata.empty()), });

    Dataset<Row> df = spark.read().format("csv").schema(schema)
        .option("header", "false").load(filename);

    // Rename the raw CSV columns, then materialize the feature vector column.
    df = df.withColumn("valuefeatures", df.col("_c0")).drop("_c0");
    df = df.withColumn("label", df.col("_c1")).drop("_c1");
    df.printSchema();
    df = df.withColumn("features", callUDF("vectorBuilder", df.col("valuefeatures")));
    df.printSchema();
    df.show();

    LinearRegression lr = new LinearRegression().setMaxIter(20);

    // Fit the model to the data.
    LinearRegressionModel model = lr.fit(df);

    // Given a dataset, predict each point's label, and show the results.
    model.transform(df).show();

    LinearRegressionTrainingSummary trainingSummary = model.summary();
    System.out.println("numIterations: " + trainingSummary.totalIterations());
    System.out.println("objectiveHistory: "
        + Vectors.dense(trainingSummary.objectiveHistory()));
    trainingSummary.residuals().show();
    System.out.println("RMSE: " + trainingSummary.rootMeanSquaredError());
    System.out.println("r2: " + trainingSummary.r2());

    double intercept = model.intercept();
    // Fixed label: the printed value is the model's intercept (was the typo "Interesection").
    System.out.println("Intercept: " + intercept);
    double regParam = model.getRegParam();
    System.out.println("Regression parameter: " + regParam);
    double tol = model.getTol();
    System.out.println("Tol: " + tol);

    double feature = 7.0; // primitive instead of boxed Double; prints identically
    Vector features = Vectors.dense(feature);
    double p = model.predict(features);
    System.out.println("Prediction for feature " + feature + " is " + p);

    // NOTE(review): regParam is the regularization strength, not a coefficient, so
    // "8 * regParam + intercept" is unlikely to be a meaningful prediction — confirm intent.
    System.out.println(8 * regParam + intercept);
}
Example #20
Source File: JavaElementwiseProductExample.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) { SparkSession spark = SparkSession .builder() .appName("JavaElementwiseProductExample") .getOrCreate(); // $example on$ // Create some vector data; also works for sparse vectors List<Row> data = Arrays.asList( RowFactory.create("a", Vectors.dense(1.0, 2.0, 3.0)), RowFactory.create("b", Vectors.dense(4.0, 5.0, 6.0)) ); List<StructField> fields = new ArrayList<StructField>(2); fields.add(DataTypes.createStructField("id", DataTypes.StringType, false)); fields.add(DataTypes.createStructField("vector", new VectorUDT(), false)); StructType schema = DataTypes.createStructType(fields); Dataset<Row> dataFrame = spark.createDataFrame(data, schema); Vector transformingVector = Vectors.dense(0.0, 1.0, 2.0); ElementwiseProduct transformer = new ElementwiseProduct() .setScalingVec(transformingVector) .setInputCol("vector") .setOutputCol("transformedVector"); // Batch transform the vectors to create new column: transformer.transform(dataFrame).show(); // $example off$ spark.stop(); }
Example #21
Source File: JavaPolynomialExpansionExample.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) { SparkSession spark = SparkSession .builder() .appName("JavaPolynomialExpansionExample") .getOrCreate(); // $example on$ PolynomialExpansion polyExpansion = new PolynomialExpansion() .setInputCol("features") .setOutputCol("polyFeatures") .setDegree(3); List<Row> data = Arrays.asList( RowFactory.create(Vectors.dense(2.0, 1.0)), RowFactory.create(Vectors.dense(0.0, 0.0)), RowFactory.create(Vectors.dense(3.0, -1.0)) ); StructType schema = new StructType(new StructField[]{ new StructField("features", new VectorUDT(), false, Metadata.empty()), }); Dataset<Row> df = spark.createDataFrame(data, schema); Dataset<Row> polyDF = polyExpansion.transform(df); polyDF.show(false); // $example off$ spark.stop(); }
Example #22
Source File: JavaPCAExample.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) { SparkSession spark = SparkSession .builder() .appName("JavaPCAExample") .getOrCreate(); // $example on$ List<Row> data = Arrays.asList( RowFactory.create(Vectors.sparse(5, new int[]{1, 3}, new double[]{1.0, 7.0})), RowFactory.create(Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0)), RowFactory.create(Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) ); StructType schema = new StructType(new StructField[]{ new StructField("features", new VectorUDT(), false, Metadata.empty()), }); Dataset<Row> df = spark.createDataFrame(data, schema); PCAModel pca = new PCA() .setInputCol("features") .setOutputCol("pcaFeatures") .setK(3) .fit(df); Dataset<Row> result = pca.transform(df).select("pcaFeatures"); result.show(false); // $example off$ spark.stop(); }
Example #23
Source File: JavaDCTExample.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) { SparkSession spark = SparkSession .builder() .appName("JavaDCTExample") .getOrCreate(); // $example on$ List<Row> data = Arrays.asList( RowFactory.create(Vectors.dense(0.0, 1.0, -2.0, 3.0)), RowFactory.create(Vectors.dense(-1.0, 2.0, 4.0, -7.0)), RowFactory.create(Vectors.dense(14.0, -2.0, -5.0, 1.0)) ); StructType schema = new StructType(new StructField[]{ new StructField("features", new VectorUDT(), false, Metadata.empty()), }); Dataset<Row> df = spark.createDataFrame(data, schema); DCT dct = new DCT() .setInputCol("features") .setOutputCol("featuresDCT") .setInverse(false); Dataset<Row> dctDf = dct.transform(df); dctDf.select("featuresDCT").show(false); // $example off$ spark.stop(); }
Example #24
Source File: ProteinSequenceEncoder.java From mmtf-spark with Apache License 2.0 | 5 votes |
/** * One-hot encodes a protein sequence. The one-hot encoding * encodes the 20 natural amino acids, plus X for any other * residue for a total of 21 elements per residue. * * @return dataset with feature vector appended */ public Dataset<Row> oneHotEncode() { SparkSession session = data.sparkSession(); int maxLength = getMaxSequenceLength(data); session.udf().register("encoder", new UDF1<String, Vector>() { private static final long serialVersionUID = -6095318836772114908L; @Override public Vector call(String s) throws Exception { int len = AMINO_ACIDS21.size(); double[] values = new double[len * maxLength]; char[] seq = s.toCharArray(); for (int i = 0; i < seq.length; i++) { int index = AMINO_ACIDS21.indexOf(seq[i]); // replace any non-matching code, e.g., U, with X if (index == -1) { index = AMINO_ACIDS21.indexOf('X'); } values[i * len + index] = 1; } return Vectors.dense(values); } }, new VectorUDT()); // append feature column data.createOrReplaceTempView("table"); data = session.sql("SELECT *, encoder(" + inputCol + ") AS " + outputCol + " from table"); return data; }
Example #25
Source File: ProteinSequenceEncoder.java From mmtf-spark with Apache License 2.0 | 5 votes |
/** * Encodes a protein sequence by 7 physicochemical * properties. * * <p> See: Meiler, J., Müller, M., Zeidler, A. et al. J Mol Model (2001) 7: 360. doi: * <a href="https://link.springer.com/article/10.1007/s008940100038">10.1007/s008940100038</a> * * @return dataset with feature vector appended */ public Dataset<Row> propertyEncode() { SparkSession session = data.sparkSession(); int maxLength = getMaxSequenceLength(data); session.udf().register("encoder", new UDF1<String, Vector>(){ private static final long serialVersionUID = 1L; @Override public Vector call(String s) throws Exception { double[] values = new double[7*maxLength]; for (int i = 0, k = 0; i < s.length(); i++) { double[] property = properties.get(s.charAt(i)); if (property != null) { for (double p: property) { values[k++] = p; } } } return Vectors.dense(values); } }, new VectorUDT()); // append feature column data.createOrReplaceTempView("table"); data = session.sql("SELECT *, encoder(" + inputCol + ") AS " + outputCol + " from table"); return data; }
Example #26
Source File: ProteinSequenceEncoder.java From mmtf-spark with Apache License 2.0 | 5 votes |
/** * Encodes a protein sequence by a Blosum62 matrix. * * <p> See: <a href="https://ftp.ncbi.nih.gov/repository/blocks/unix/blosum/BLOSUM/blosum62.blast.new">BLOSUM62 Matrix</a> * * @return dataset with feature vector appended */ public Dataset<Row> blosum62Encode() { SparkSession session = data.sparkSession(); int maxLength = getMaxSequenceLength(data); session.udf().register("encoder", new UDF1<String, Vector>(){ private static final long serialVersionUID = 1L; @Override public Vector call(String s) throws Exception { double[] values = new double[20*maxLength]; for (int i = 0, k = 0; i < s.length(); i++) { double[] property = blosum62.get(s.charAt(i)); if (property != null) { for (double p: property) { values[k++] = p; } } } return Vectors.dense(values); } }, new VectorUDT()); // append feature column data.createOrReplaceTempView("table"); data = session.sql("SELECT *, encoder(" + inputCol + ") AS " + outputCol + " from table"); return data; }
Example #27
Source File: ProteinSequenceEncoder.java From mmtf-spark with Apache License 2.0 | 5 votes |
private static Dataset<Row> averageFeatureVectors(Dataset<Row> data, String outputCol) { SparkSession session = data.sparkSession(); session.udf().register("averager", new UDF3<Vector, Vector, Vector, Vector>() { private static final long serialVersionUID = -8190379199020903671L; @Override public Vector call(Vector v1, Vector v2, Vector v3) throws Exception { double[] f1 = v1.toArray(); double[] f2 = v2.toArray(); double[] f3 = v3.toArray(); // arrays may be of different length int len = Math.min(Math.min(f1.length, f2.length), f3.length); double[] average = new double[len]; for (int i = 0; i < len; i++) { average[i] = (f1[i] + f2[i] + f3[i]) / 3.0; } return Vectors.dense(average); } }, new VectorUDT()); data.createOrReplaceTempView("table"); // append new feature column with average values return session.sql("SELECT *, averager(features0,features1,features2) AS " + outputCol + " from table"); }
Example #28
Source File: JavaVectorAssemblerExample.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) { SparkSession spark = SparkSession .builder() .appName("JavaVectorAssemblerExample") .getOrCreate(); // $example on$ StructType schema = createStructType(new StructField[]{ createStructField("id", IntegerType, false), createStructField("hour", IntegerType, false), createStructField("mobile", DoubleType, false), createStructField("userFeatures", new VectorUDT(), false), createStructField("clicked", DoubleType, false) }); Row row = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0); Dataset<Row> dataset = spark.createDataFrame(Arrays.asList(row), schema); VectorAssembler assembler = new VectorAssembler() .setInputCols(new String[]{"hour", "mobile", "userFeatures"}) .setOutputCol("features"); Dataset<Row> output = assembler.transform(dataset); System.out.println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column " + "'features'"); output.select("features", "clicked").show(false); // $example off$ spark.stop(); }
Example #29
Source File: JavaMinHashLSHExample.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) { SparkSession spark = SparkSession .builder() .appName("JavaMinHashLSHExample") .getOrCreate(); // $example on$ List<Row> data = Arrays.asList( RowFactory.create(0, Vectors.sparse(6, new int[]{0, 1, 2}, new double[]{1.0, 1.0, 1.0})), RowFactory.create(1, Vectors.sparse(6, new int[]{2, 3, 4}, new double[]{1.0, 1.0, 1.0})), RowFactory.create(2, Vectors.sparse(6, new int[]{0, 2, 4}, new double[]{1.0, 1.0, 1.0})) ); StructType schema = new StructType(new StructField[]{ new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), new StructField("keys", new VectorUDT(), false, Metadata.empty()) }); Dataset<Row> dataFrame = spark.createDataFrame(data, schema); MinHashLSH mh = new MinHashLSH() .setNumHashTables(1) .setInputCol("keys") .setOutputCol("values"); MinHashLSHModel model = mh.fit(dataFrame); model.transform(dataFrame).show(); // $example off$ spark.stop(); }
Example #30
Source File: JavaNormalizerExample.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) { SparkSession spark = SparkSession .builder() .appName("JavaNormalizerExample") .getOrCreate(); // $example on$ List<Row> data = Arrays.asList( RowFactory.create(0, Vectors.dense(1.0, 0.1, -8.0)), RowFactory.create(1, Vectors.dense(2.0, 1.0, -4.0)), RowFactory.create(2, Vectors.dense(4.0, 10.0, 8.0)) ); StructType schema = new StructType(new StructField[]{ new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), new StructField("features", new VectorUDT(), false, Metadata.empty()) }); Dataset<Row> dataFrame = spark.createDataFrame(data, schema); // Normalize each Vector using $L^1$ norm. Normalizer normalizer = new Normalizer() .setInputCol("features") .setOutputCol("normFeatures") .setP(1.0); Dataset<Row> l1NormData = normalizer.transform(dataFrame); l1NormData.show(); // Normalize each Vector using $L^\infty$ norm. Dataset<Row> lInfNormData = normalizer.transform(dataFrame, normalizer.p().w(Double.POSITIVE_INFINITY)); lInfNormData.show(); // $example off$ spark.stop(); }