Python pyspark.ml.linalg.Vectors.dense() Examples
The following are 30 code examples of pyspark.ml.linalg.Vectors.dense(), drawn from open-source projects. The originating project and source file are noted above each example. You may also want to check out all available functions and classes of the pyspark.ml.linalg.Vectors module.
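Before the project examples, here is a minimal sketch of the API itself. It runs with only pyspark installed, and the variable names are illustrative. Vectors.dense() accepts either varargs of numbers or a single iterable, returns a DenseVector, and converts back to numpy via toArray().

from pyspark.ml.linalg import Vectors

# Both call styles build the same DenseVector.
v1 = Vectors.dense(1.0, 2.0, 3.0)    # varargs of floats
v2 = Vectors.dense([1.0, 2.0, 3.0])  # a single list (or numpy array)
assert v1 == v2
assert v1.size == 3
arr = v1.toArray()  # numpy array([1., 2., 3.])

# The sparse counterpart, which several examples below mix with dense vectors:
sv = Vectors.sparse(3, [0, 2], [1.0, 3.0])  # size, indices, values
assert sv.toArray()[1] == 0.0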
Example #1
Source File: test_scaler.py From onnxmltools with MIT License
def test_maxabs_scaler(self):
    data = self.spark.createDataFrame([
        (0, Vectors.dense([1.0, 0.1, -1.0]),),
        (1, Vectors.dense([2.0, 1.1, 1.0]),),
        (2, Vectors.dense([3.0, 10.1, 3.0]),)
    ], ["id", "features"])
    scaler = MaxAbsScaler(inputCol='features', outputCol='scaled_features')
    model = scaler.fit(data)

    # the input names must match the inputCol(s) above
    model_onnx = convert_sparkml(model, 'Sparkml MaxAbsScaler',
                                 [('features', FloatTensorType([1, 3]))])
    self.assertTrue(model_onnx is not None)

    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().scaled_features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlMaxAbsScaler")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['scaled_features'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
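Nearly every onnxmltools example on this page converts the Spark vector column to a dense numpy matrix before feeding it to the ONNX runtime. A sketch of just that conversion step, assuming a DataFrame named data with a vector column named 'features' as in the example above:

import numpy
import pandas

# Each cell of 'features' holds a pyspark.ml.linalg vector; expanding each
# vector into a pandas.Series turns the column into a 2-D float32 matrix
# with one row per DataFrame row.
data_np = (data.toPandas().features
               .apply(lambda x: pandas.Series(x.toArray()))
               .values.astype(numpy.float32))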
Example #2
Source File: test_normalizer.py From onnxmltools with MIT License
def test_model_normalizer_2(self):
    data = self.spark.createDataFrame([
        (0, Vectors.dense(1.0, 0.5, -1.0)),
        (1, Vectors.dense(2.0, 1.0, 1.0)),
        (2, Vectors.dense(4.0, 10.0, 2.0))
    ]).toDF("id", "features")
    model = Normalizer(inputCol='features', outputCol='norm_feature', p=2.0)
    model_onnx = convert_sparkml(model, 'Sparkml Normalizer',
                                 [('features', FloatTensorType([1, 3]))])
    self.assertTrue(model_onnx is not None)

    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().norm_feature.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlNormalizer")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['norm_feature'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #3
Source File: test_aft_survival_regression.py From onnxmltools with MIT License
def test_aft_regression_survival(self):
    data = self.spark.createDataFrame([
        (1.0, Vectors.dense(1.0), 1.0),
        (1e-40, Vectors.sparse(1, [], []), 0.0)
    ], ["label", "features", "censor"])
    gbt = AFTSurvivalRegression()
    model = gbt.fit(data)
    feature_count = data.first()[1].size
    model_onnx = convert_sparkml(model, 'Sparkml AFTSurvivalRegression', [
        ('features', FloatTensorType([1, feature_count]))
    ], spark_session=self.spark)
    self.assertTrue(model_onnx is not None)

    # run the model
    predicted = model.transform(data)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32),
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlAFTSurvivalRegression")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['prediction'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #4
Source File: test_spark_dataset_converter.py From petastorm with Apache License 2.0
def test_vector_to_array(spark_test_ctx):
    from pyspark.ml.linalg import Vectors
    from pyspark.mllib.linalg import Vectors as OldVectors
    df = spark_test_ctx.spark.createDataFrame([
        (Vectors.dense(1.0, 2.0, 3.0), OldVectors.dense(10.0, 20.0, 30.0)),
        (Vectors.dense(5.0, 6.0, 7.0), OldVectors.dense(50.0, 60.0, 70.0))
    ], ["vec", "oldVec"])
    converter1 = make_spark_converter(df)
    with converter1.make_tf_dataset(num_epochs=1) as dataset:
        # TF1 graph-mode API: build the iterator, then evaluate in a session
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            ts = sess.run(tensor)
    assert np.float32 == ts.vec.dtype.type
    assert np.float32 == ts.oldVec.dtype.type
    vec_col = ts.vec[ts.vec[:, 0].argsort()]
    old_vec_col = ts.oldVec[ts.oldVec[:, 0].argsort()]
    assert (2, 3) == ts.vec.shape
    assert (2, 3) == ts.oldVec.shape
    assert ([1., 2., 3.] == vec_col[0]).all() and \
        ([5., 6., 7.] == vec_col[1]).all()
    assert ([10., 20., 30.] == old_vec_col[0]).all() and \
        ([50., 60., 70.] == old_vec_col[1]).all()
Example #5
Source File: test_decision_tree_classifier.py From onnxmltools with MIT License
def test_tree_one_class_classification(self):
    features = [[0., 1.], [1., 1.], [2., 0.]]
    features = numpy.array(features, dtype=numpy.float32)
    labels = [1, 1, 1]
    dd = [(labels[i], Vectors.dense(features[i])) for i in range(len(labels))]
    data = self.spark.createDataFrame(
        self.spark.sparkContext.parallelize(dd), schema=["label", "features"])
    dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
    model = dt.fit(data)
    feature_count = 1
    model_onnx = convert_sparkml(model, 'Sparkml Decision Tree One Class', [
        ('features', FloatTensorType([1, feature_count]))
    ], spark_session=self.spark)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    predicted = model.transform(data)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32),
        predicted.toPandas().probability.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlDecisionTreeBinaryClass")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['prediction', 'probability'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #6
Source File: test_scaler.py From onnxmltools with MIT License
def test_minmax_scaler(self):
    data = self.spark.createDataFrame([
        (0, Vectors.dense([1.0, 0.1, -1.0]),),
        (1, Vectors.dense([2.0, 1.1, 1.0]),),
        (2, Vectors.dense([3.0, 10.1, 3.0]),)
    ], ["id", "features"])
    scaler = MinMaxScaler(inputCol='features', outputCol='scaled_features')
    model = scaler.fit(data)

    # the input names must match the inputCol(s) above
    model_onnx = convert_sparkml(model, 'Sparkml MinMaxScaler',
                                 [('features', FloatTensorType([1, 3]))])
    self.assertTrue(model_onnx is not None)

    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().scaled_features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlMinMaxScaler")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['scaled_features'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #7
Source File: tests.py From LearningApacheSpark with MIT License
def test_vector_size_hint(self):
    df = self.spark.createDataFrame(
        [(0, Vectors.dense([0.0, 10.0, 0.5])),
         (1, Vectors.dense([1.0, 11.0, 0.5, 0.6])),
         (2, Vectors.dense([2.0, 12.0]))],
        ["id", "vector"])

    sizeHint = VectorSizeHint(
        inputCol="vector",
        handleInvalid="skip")
    sizeHint.setSize(3)
    self.assertEqual(sizeHint.getSize(), 3)

    output = sizeHint.transform(df).head().vector
    expected = DenseVector([0.0, 10.0, 0.5])
    self.assertEqual(output, expected)
Example #8
Source File: test_polynomial_expansion.py From onnxmltools with MIT License
def test_model_polynomial_expansion(self):
    data = self.spark.createDataFrame([
        (Vectors.dense([1.2, 3.2, 1.3, -5.6]),),
        (Vectors.dense([4.3, -3.2, 5.7, 1.0]),),
        (Vectors.dense([0, 3.2, 4.7, -8.9]),)
    ], ["dense"])
    model = PolynomialExpansion(degree=2, inputCol="dense", outputCol="expanded")

    # the input name should match PolynomialExpansion's inputCol
    feature_count = data.first()[0].size
    N = data.count()
    model_onnx = convert_sparkml(model, 'Sparkml PolynomialExpansion',
                                 [('dense', FloatTensorType([N, feature_count]))])
    self.assertTrue(model_onnx is not None)

    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().expanded.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().dense.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlPolynomialExpansion")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['expanded'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #9
Source File: test_decision_tree_classifier.py From onnxmltools with MIT License
def test_tree_binary_classification(self):
    features = [[0, 1], [1, 1], [2, 0]]
    features = numpy.array(features, dtype=numpy.float32)
    labels = [0, 1, 0]
    dd = [(labels[i], Vectors.dense(features[i])) for i in range(len(labels))]
    data = self.spark.createDataFrame(
        self.spark.sparkContext.parallelize(dd), schema=["label", "features"])
    dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
    model = dt.fit(data)
    feature_count = 2
    model_onnx = convert_sparkml(model, 'Sparkml Decision Tree Binary Class', [
        ('features', FloatTensorType([1, feature_count]))
    ], spark_session=self.spark)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    predicted = model.transform(data)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32),
        predicted.toPandas().probability.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlDecisionTreeBinaryClass")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['prediction', 'probability'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #10
Source File: test_PCA.py From onnxmltools with MIT License
def test_model_pca(self):
    data = self.spark.createDataFrame([
        (Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
        (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
        (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)
    ], ["features"])
    pca = PCA(k=2, inputCol="features", outputCol="pca_features")
    model = pca.fit(data)

    # the input name should match PCA's inputCol
    feature_count = data.first()[0].size
    N = data.count()
    model_onnx = convert_sparkml(model, 'Sparkml PCA',
                                 [('features', FloatTensorType([N, feature_count]))])
    self.assertTrue(model_onnx is not None)

    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().pca_features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx, basename="SparkmlPCA")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['pca_features'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #11
Source File: feature_engineering.py From search-MjoLniR with MIT License
def zero_features(df, *feature_names):
    """Zero out features in the feature vector.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    feature_names : list of str

    Returns
    -------
    pyspark.sql.DataFrame
    """
    features = df.schema['features'].metadata['features']
    idxs = [features.index(name) for name in feature_names]

    def zero_features(feat):
        raw = feat.toArray()
        for idx in idxs:
            raw[idx] = 0.
        return Vectors.dense(raw)
    zero_features_udf = F.udf(zero_features, VectorUDT())

    return df.withColumn('features', mjolnir.spark.add_meta(
        df._sc, zero_features_udf('features'), {'features': features}))
Example #12
Source File: feature_engineering.py From search-MjoLniR with MIT License
def append_features(df, *cols):
    """Append features from columns to the features vector.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    cols : list of str

    Returns
    -------
    pyspark.sql.DataFrame
    """
    def add_features(feat, *other):
        raw = feat.toArray()
        return Vectors.dense(np.append(raw, list(map(float, other))))
    add_features_udf = F.udf(add_features, VectorUDT())

    # cols arrives as a tuple; convert so it concatenates with the metadata list
    new_feat_list = df.schema['features'].metadata['features'] + list(cols)
    return df.withColumn('features', mjolnir.spark.add_meta(
        df._sc, add_features_udf('features', *cols), {'features': new_feat_list}))
Example #13
Source File: test_gbt_regressor.py From onnxmltools with MIT License
def test_gbt_regressor(self):
    data = self.spark.createDataFrame([
        (1.0, Vectors.dense(1.0)),
        (0.0, Vectors.sparse(1, [], []))
    ], ["label", "features"])
    gbt = GBTRegressor(maxIter=5, maxDepth=2, seed=42)
    model = gbt.fit(data)
    feature_count = data.first()[1].size
    model_onnx = convert_sparkml(model, 'Sparkml GBTRegressor', [
        ('features', FloatTensorType([1, feature_count]))
    ], spark_session=self.spark)
    self.assertTrue(model_onnx is not None)

    # run the model
    predicted = model.transform(data)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32),
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlGBTRegressor")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['prediction'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #14
Source File: test_normalizer.py From onnxmltools with MIT License
def test_model_normalizer_1(self):
    data = self.spark.createDataFrame([
        (0, Vectors.dense(1.0, 0.5, -1.0)),
        (1, Vectors.dense(2.0, 1.0, 1.0)),
        (2, Vectors.dense(4.0, 10.0, 2.0))
    ]).toDF("id", "features")
    model = Normalizer(inputCol='features', outputCol='norm_feature', p=1.0)
    model_onnx = convert_sparkml(model, 'Sparkml Normalizer',
                                 [('features', FloatTensorType([1, 3]))])
    self.assertTrue(model_onnx is not None)

    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().norm_feature.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlNormalizer")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['norm_feature'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #15
Source File: test_decision_tree_classifier.py From onnxmltools with MIT License
def test_tree_multiple_classification(self):
    features = [[0, 1], [1, 1], [2, 0], [0.5, 0.5], [1.1, 1.1], [2.1, 0.1]]
    features = numpy.array(features, dtype=numpy.float32)
    labels = [0, 1, 2, 1, 1, 2]
    dd = [(labels[i], Vectors.dense(features[i])) for i in range(len(labels))]
    data = self.spark.createDataFrame(
        self.spark.sparkContext.parallelize(dd), schema=["label", "features"])
    dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
    model = dt.fit(data)
    feature_count = 2
    model_onnx = convert_sparkml(model, 'Sparkml Decision Tree Multi Class', [
        ('features', FloatTensorType([1, feature_count]))
    ], spark_session=self.spark)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    predicted = model.transform(data)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32),
        predicted.toPandas().probability.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlDecisionTreeMultiClass")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['prediction', 'probability'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #16
Source File: tests.py From LearningApacheSpark with MIT License
def test_save_load_trained_model(self):
    # This tests saving and loading the trained model only.
    # Save/load for TrainValidationSplit will be added later: SPARK-13786
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    tvsModel = tvs.fit(dataset)
    lrModel = tvsModel.bestModel

    tvsModelPath = temp_path + "/tvsModel"
    lrModel.save(tvsModelPath)
    loadedLrModel = LogisticRegressionModel.load(tvsModelPath)
    self.assertEqual(loadedLrModel.uid, lrModel.uid)
    self.assertEqual(loadedLrModel.intercept, lrModel.intercept)
Example #17
Source File: test_decision_tree_regressor.py From onnxmltools with MIT License
def test_decision_tree_regressor(self):
    features = [[0, 1], [1, 1], [2, 0]]
    features = numpy.array(features, dtype=numpy.float32)
    labels = [100, -10, 50]
    dd = [(labels[i], Vectors.dense(features[i])) for i in range(len(labels))]
    data = self.spark.createDataFrame(
        self.spark.sparkContext.parallelize(dd), schema=["label", "features"])
    dt = DecisionTreeRegressor(labelCol="label", featuresCol="features")
    model = dt.fit(data)
    feature_count = data.select('features').first()[0].size
    model_onnx = convert_sparkml(model, 'Sparkml Decision Tree Regressor', [
        ('features', FloatTensorType([1, feature_count]))
    ], spark_session=self.spark)
    self.assertTrue(model_onnx is not None)

    # run the model
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    predicted = model.transform(data)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlDecisionTreeRegressor")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['prediction'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #18
Source File: test_vector_indexer.py From onnxmltools with MIT License
def test_model_vector_indexer_single(self):
    vi = VectorIndexer(maxCategories=3, inputCol="a", outputCol="indexed")
    data = self.spark.createDataFrame([
        (Vectors.dense([-1.0]),),
        (Vectors.dense([0.0]),),
        (Vectors.dense([0.0]),)
    ], ["a"])
    model = vi.fit(data)
    model_onnx = convert_sparkml(model, 'Sparkml VectorIndexer Single', [
        ('a', FloatTensorType([1, model.numFeatures]))
    ], target_opset=9)
    self.assertTrue(model_onnx is not None)

    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().indexed.apply(
        lambda x: pandas.Series(x.toArray())).values
    data_np = data.toPandas().a.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlVectorIndexerSingle")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['indexed'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #19
Source File: test_naive_bayes.py From onnxmltools with MIT License
def test_naive_bayes_multinomial(self):
    data = self.spark.createDataFrame([
        Row(label=0.0, weight=0.1, features=Vectors.dense([0.0, 0.0])),
        Row(label=0.0, weight=0.5, features=Vectors.dense([0.0, 1.0])),
        Row(label=1.0, weight=1.0, features=Vectors.dense([1.0, 0.0]))
    ])
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial", weightCol="weight")
    model = nb.fit(data)
    feature_count = data.select('features').first()[0].size
    model_onnx = convert_sparkml(model, 'Sparkml NaiveBayes Multinomial',
                                 [('features', FloatTensorType([1, feature_count]))])
    self.assertTrue(model_onnx is not None)

    # run the model
    predicted = model.transform(data)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32),
        predicted.toPandas().probability.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlNaiveBayesMultinomial")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['prediction', 'probability'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #20
Source File: tests.py From LearningApacheSpark with MIT License
def test_parallel_evaluation(self):
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [5, 6]).build()
    evaluator = BinaryClassificationEvaluator()
    tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    tvs.setParallelism(1)
    tvsSerialModel = tvs.fit(dataset)
    tvs.setParallelism(2)
    tvsParallelModel = tvs.fit(dataset)
    self.assertEqual(tvsSerialModel.validationMetrics, tvsParallelModel.validationMetrics)
Example #21
Source File: test_naive_bayes.py From onnxmltools with MIT License
def test_naive_bayes_bernoulli(self):
    data = self.spark.createDataFrame([
        Row(label=0.0, weight=0.1, features=Vectors.dense([0.0, 0.0])),
        Row(label=0.0, weight=0.5, features=Vectors.dense([0.0, 1.0])),
        Row(label=1.0, weight=1.0, features=Vectors.dense([1.0, 0.0]))
    ])
    nb = NaiveBayes(smoothing=1.0, modelType="bernoulli", weightCol="weight")
    model = nb.fit(data)
    feature_count = data.select('features').first()[0].size
    model_onnx = convert_sparkml(model, 'Sparkml NaiveBayes Bernoulli',
                                 [('features', FloatTensorType([1, feature_count]))])
    self.assertTrue(model_onnx is not None)

    # run the model
    predicted = model.transform(data)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32),
        predicted.toPandas().probability.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlNaiveBayesBernoulli")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['prediction', 'probability'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #22
Source File: tests.py From LearningApacheSpark with MIT License
def test_linear_regression_pmml_basic(self):
    # Most of the validation is done on the Scala side; here we just check
    # that we output text rather than parquet (e.g. that the format flag
    # was respected).
    df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LinearRegression(maxIter=1)
    model = lr.fit(df)
    path = tempfile.mkdtemp()
    lr_path = path + "/lr-pmml"
    model.write().format("pmml").save(lr_path)
    pmml_text_list = self.sc.textFile(lr_path).collect()
    pmml_text = "\n".join(pmml_text_list)
    self.assertIn("Apache Spark", pmml_text)
    self.assertIn("PMML", pmml_text)
Example #23
Source File: tests.py From LearningApacheSpark with MIT License
def test_onevsrest(self):
    temp_path = tempfile.mkdtemp()
    df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                     (1.0, Vectors.sparse(2, [], [])),
                                     (2.0, Vectors.dense(0.5, 0.5))] * 10,
                                    ["label", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    model = ovr.fit(df)

    ovrPath = temp_path + "/ovr"
    ovr.save(ovrPath)
    loadedOvr = OneVsRest.load(ovrPath)
    self._compare_pipelines(ovr, loadedOvr)

    modelPath = temp_path + "/ovrModel"
    model.save(modelPath)
    loadedModel = OneVsRestModel.load(modelPath)
    self._compare_pipelines(model, loadedModel)
Example #24
Source File: test_linear_regressor.py From onnxmltools with MIT License
def test_model_linear_regression_basic(self):
    data = self.spark.createDataFrame([
        (1.0, 2.0, Vectors.dense(1.0)),
        (0.0, 2.0, Vectors.sparse(1, [], []))
    ], ["label", "weight", "features"])
    lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight")
    model = lr.fit(data)

    # the name of the input is 'features'
    C = model.numFeatures
    model_onnx = convert_sparkml(model, 'sparkml LinearRegressorBasic',
                                 [('features', FloatTensorType([1, C]))])
    self.assertTrue(model_onnx is not None)

    # run the model
    import pandas
    predicted = model.transform(data)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlLinearRegressor_Basic")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['prediction'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #25
Source File: test_vector_slicer.py From onnxmltools with MIT License
def test_vector_slicer(self):
    data = self.spark.createDataFrame([
        (Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]),),
        (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]),),
        (Vectors.dense([0.6, -1.1, -3.0, 4.5, 3.3]),)
    ], ["features"])
    model = VectorSlicer(inputCol="features", outputCol="sliced", indices=[1, 4])
    feature_count = data.first()[0].array.size
    model_onnx = convert_sparkml(model, 'Sparkml VectorSlicer',
                                 [('features', FloatTensorType([1, feature_count]))])
    self.assertTrue(model_onnx is not None)

    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().sliced.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlVectorSlicer")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['sliced'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #26
Source File: test_chi_sql_selector.py From onnxmltools with MIT License
def test_chi_sq_selector(self):
    data = self.spark.createDataFrame([
        (Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),
        (Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),
        (Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0)
    ], ["features", "label"])
    selector = ChiSqSelector(numTopFeatures=1, outputCol="selectedFeatures")
    model = selector.fit(data)
    print(model.selectedFeatures)

    # the input name should match ChiSqSelector's inputCol
    feature_count = data.first()[0].size
    N = data.count()
    model_onnx = convert_sparkml(model, 'Sparkml ChiSqSelector',
                                 [('features', FloatTensorType([N, feature_count]))])
    self.assertTrue(model_onnx is not None)

    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().selectedFeatures.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlChiSqSelector")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['selectedFeatures'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #27
Source File: tests.py From LearningApacheSpark with MIT License
def test_gaussian_mixture_summary(self):
    data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
            (Vectors.sparse(1, [], []),)]
    df = self.spark.createDataFrame(data, ["features"])
    gmm = GaussianMixture(k=2)
    model = gmm.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.probabilityCol, "probability")
    self.assertTrue(isinstance(s.probability, DataFrame))
    self.assertEqual(s.featuresCol, "features")
    self.assertEqual(s.predictionCol, "prediction")
    self.assertTrue(isinstance(s.cluster, DataFrame))
    self.assertEqual(len(s.clusterSizes), 2)
    self.assertEqual(s.k, 2)
    self.assertEqual(s.numIter, 3)
Example #28
Source File: converter.py From spark-sklearn with Apache License 2.0
def _toSparkGLM(self, model):
    """
    Private method for converting a GLM to a Spark model
    TODO: Add model parameters as well.
    """
    skl_cls = type(model)
    py_cls = self._skl2spark_classes[skl_cls].py
    jvm_cls_name = self._skl2spark_classes[skl_cls].jvm
    intercept = model.intercept_
    weights = model.coef_
    if len(np.shape(weights)) == 1 \
            or (len(np.shape(weights)) == 2 and np.shape(weights)[0] == 1):
        # Binary classification
        uid = _randomUID(skl_cls)
        _java_model = _new_java_obj(self.sc, jvm_cls_name, uid,
                                    Vectors.dense(weights), float(intercept))
        return py_cls(_java_model)
    elif len(np.shape(weights)) == 2 and skl_cls == SKL_LogisticRegression:
        # Multiclass label
        raise ValueError("Converter.toSpark cannot convert a multiclass sklearn Logistic" +
                         " Regression model to Spark because Spark does not yet support" +
                         " multiclass. Given model is for %d classes." % np.shape(weights)[0])
    else:
        # str() conversions are required; concatenating a type or an int to a
        # str would itself raise a TypeError
        raise Exception("Converter.toSpark experienced unknown error when trying to convert" +
                        " a model of type: " + str(type(model)) + " " +
                        str(len(np.shape(weights))))
Example #29
Source File: tests.py From LearningApacheSpark with MIT License
def test_raw_and_probability_prediction(self):
    data_path = "data/mllib/sample_multiclass_classification_data.txt"
    df = self.spark.read.format("libsvm").load(data_path)

    mlp = MultilayerPerceptronClassifier(maxIter=100, layers=[4, 5, 4, 3],
                                         blockSize=128, seed=123)
    model = mlp.fit(df)
    test = self.sc.parallelize([Row(features=Vectors.dense(0.1, 0.1, 0.25, 0.25))]).toDF()
    result = model.transform(test).head()
    expected_prediction = 2.0
    expected_probability = [0.0, 0.0, 1.0]
    expected_rawPrediction = [57.3955, -124.5462, 67.9943]
    # assertEqual, not assertTrue: assertTrue(x, y) treats y as a message
    # and never compares the two values
    self.assertEqual(result.prediction, expected_prediction)
    self.assertTrue(np.allclose(result.probability, expected_probability, atol=1E-4))
    self.assertTrue(np.allclose(result.rawPrediction, expected_rawPrediction, atol=1E-4))
Example #30
Source File: tests.py From LearningApacheSpark with MIT License
def test_multinomial_logistic_regression_with_bound(self):
    data_path = "data/mllib/sample_multiclass_classification_data.txt"
    df = self.spark.read.format("libsvm").load(data_path)

    lor = LogisticRegression(regParam=0.01,
                             lowerBoundsOnCoefficients=Matrices.dense(3, 4, range(12)),
                             upperBoundsOnIntercepts=Vectors.dense(0.0, 0.0, 0.0))
    model = lor.fit(df)
    expected = [[4.593, 4.5516, 9.0099, 12.2904],
                [1.0, 8.1093, 7.0, 10.0],
                [3.041, 5.0, 8.0, 11.0]]
    for i in range(0, len(expected)):
        self.assertTrue(
            np.allclose(model.coefficientMatrix.toArray()[i], expected[i], atol=1E-4))
    self.assertTrue(
        np.allclose(model.interceptVector.toArray(), [-0.9057, -1.1392, -0.0033], atol=1E-4))