Python pyspark.ml.linalg.Vectors.dense() Examples

The following are 30 code examples of pyspark.ml.linalg.Vectors.dense(). The source file, originating project, and license are noted above each example. You may also want to check out the other functions and classes available in the pyspark.ml.linalg module.
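
Before the project examples, here is a minimal standalone sketch (not taken from any of the projects below) of what Vectors.dense() does: it wraps positional numbers, a Python list, or a numpy array in a DenseVector, which is the type Spark ML expects in a features column.

from pyspark.ml.linalg import Vectors

# Both call styles produce the same DenseVector of 64-bit floats.
v_from_args = Vectors.dense(1.0, 2.0, 3.0)
v_from_list = Vectors.dense([1.0, 2.0, 3.0])
assert v_from_args == v_from_list

# A DenseVector converts back to a numpy array when needed, which is
# what the examples below do with x.toArray().
print(v_from_list.toArray())   # [1. 2. 3.]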
Example #1
Source File: test_scaler.py    From onnxmltools with MIT License
def test_maxabs_scaler(self):
        data = self.spark.createDataFrame([
            (0, Vectors.dense([1.0, 0.1, -1.0]),),
            (1, Vectors.dense([2.0, 1.1, 1.0]),),
            (2, Vectors.dense([3.0, 10.1, 3.0]),)
        ], ["id", "features"])
        scaler = MaxAbsScaler(inputCol='features', outputCol='scaled_features')
        model = scaler.fit(data)

        # the input names must match the inputCol(s) above
        model_onnx = convert_sparkml(model, 'Sparkml MaxAbsScaler', [('features', FloatTensorType([1, 3]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().scaled_features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np, expected, model, model_onnx, basename="SparkmlMaxAbsScaler")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['scaled_features'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #2
Source File: test_normalizer.py    From onnxmltools with MIT License
def test_model_normalizer_2(self):
        data = self.spark.createDataFrame([
          (0, Vectors.dense(1.0, 0.5, -1.0)),
          (1, Vectors.dense(2.0, 1.0, 1.0)),
          (2, Vectors.dense(4.0, 10.0, 2.0))
        ]).toDF("id", "features")
        model = Normalizer(inputCol='features', outputCol='norm_feature', p=2.0)

        model_onnx = convert_sparkml(model, 'Sparkml Normalizer', [('features', FloatTensorType([1, 3]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().norm_feature.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np, expected, model, model_onnx, basename="SparkmlNormalizer")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['norm_feature'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #3
Source File: test_aft_survival_regression.py    From onnxmltools with MIT License
def test_aft_regression_survival(self):
        data = self.spark.createDataFrame([
            (1.0, Vectors.dense(1.0), 1.0),
            (1e-40, Vectors.sparse(1, [], []), 0.0)
        ], ["label", "features", "censor"])
        gbt = AFTSurvivalRegression()
        model = gbt.fit(data)
        feature_count = data.first()[1].size
        model_onnx = convert_sparkml(model, 'Sparkml AFTSurvivalRegression', [
            ('features', FloatTensorType([1, feature_count]))
        ], spark_session=self.spark)
        self.assertTrue(model_onnx is not None)
        # run the model
        predicted = model.transform(data)
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        expected = [
            predicted.toPandas().prediction.values.astype(numpy.float32),
        ]
        paths = save_data_models(data_np, expected, model, model_onnx,
                                    basename="SparkmlAFTSurvivalRegression")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['prediction'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #4
Source File: test_spark_dataset_converter.py    From petastorm with Apache License 2.0
def test_vector_to_array(spark_test_ctx):
    from pyspark.ml.linalg import Vectors
    from pyspark.mllib.linalg import Vectors as OldVectors
    df = spark_test_ctx.spark.createDataFrame([
        (Vectors.dense(1.0, 2.0, 3.0), OldVectors.dense(10.0, 20.0, 30.0)),
        (Vectors.dense(5.0, 6.0, 7.0), OldVectors.dense(50.0, 60.0, 70.0))
    ], ["vec", "oldVec"])
    converter1 = make_spark_converter(df)
    with converter1.make_tf_dataset(num_epochs=1) as dataset:
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            ts = sess.run(tensor)
    assert np.float32 == ts.vec.dtype.type
    assert np.float32 == ts.oldVec.dtype.type
    vec_col = ts.vec[ts.vec[:, 0].argsort()]
    old_vec_col = ts.oldVec[ts.oldVec[:, 0].argsort()]
    assert (2, 3) == ts.vec.shape
    assert (2, 3) == ts.oldVec.shape
    assert ([1., 2., 3.] == vec_col[0]).all() and \
           ([5., 6., 7.] == vec_col[1]).all()
    assert ([10., 20., 30.] == old_vec_col[0]).all() and \
           ([50., 60., 70] == old_vec_col[1]).all() 
Example #5
Source File: test_decision_tree_classifier.py    From onnxmltools with MIT License
def test_tree_one_class_classification(self):
        features = [[0., 1.], [1., 1.], [2., 0.]]
        features = numpy.array(features, dtype=numpy.float32)
        labels = [1, 1, 1]
        dd = [(labels[i], Vectors.dense(features[i])) for i in range(len(labels))]
        data = self.spark.createDataFrame(self.spark.sparkContext.parallelize(dd), schema=["label", "features"])
        dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
        model = dt.fit(data)
        feature_count = 2  # each feature vector above has two elements
        model_onnx = convert_sparkml(model, 'Sparkml Decision Tree One Class', [
            ('features', FloatTensorType([1, feature_count]))
        ], spark_session=self.spark)
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        predicted = model.transform(data)
        expected = [
            predicted.toPandas().prediction.values.astype(numpy.float32),
            predicted.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        ]
        paths = save_data_models(data_np, expected, model, model_onnx,
                                basename="SparkmlDecisionTreeBinaryClass")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['prediction', 'probability'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #6
Source File: test_scaler.py    From onnxmltools with MIT License
def test_minmax_scaler(self):
        data = self.spark.createDataFrame([
            (0, Vectors.dense([1.0, 0.1, -1.0]),),
            (1, Vectors.dense([2.0, 1.1, 1.0]),),
            (2, Vectors.dense([3.0, 10.1, 3.0]),)
        ], ["id", "features"])
        scaler = MinMaxScaler(inputCol='features', outputCol='scaled_features')
        model = scaler.fit(data)

        # the input names must match the inputCol(s) above
        model_onnx = convert_sparkml(model, 'Sparkml MinMaxScaler', [('features', FloatTensorType([1, 3]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().scaled_features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np, expected, model, model_onnx, basename="SparkmlMinMaxScaler")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['scaled_features'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #7
Source File: tests.py    From LearningApacheSpark with MIT License
def test_vector_size_hint(self):
        df = self.spark.createDataFrame(
            [(0, Vectors.dense([0.0, 10.0, 0.5])),
             (1, Vectors.dense([1.0, 11.0, 0.5, 0.6])),
             (2, Vectors.dense([2.0, 12.0]))],
            ["id", "vector"])

        sizeHint = VectorSizeHint(
            inputCol="vector",
            handleInvalid="skip")
        sizeHint.setSize(3)
        self.assertEqual(sizeHint.getSize(), 3)

        output = sizeHint.transform(df).head().vector
        expected = DenseVector([0.0, 10.0, 0.5])
        self.assertEqual(output, expected) 
Example #8
Source File: test_polynomial_expansion.py    From onnxmltools with MIT License
def test_model_polynomial_expansion(self):
        data = self.spark.createDataFrame([
            (Vectors.dense([1.2, 3.2, 1.3, -5.6]),),
            (Vectors.dense([4.3, -3.2, 5.7, 1.0]),),
            (Vectors.dense([0, 3.2, 4.7, -8.9]),)
        ], ["dense"])
        model = PolynomialExpansion(degree=2, inputCol="dense", outputCol="expanded")

        # the input name should match the inputCol of the PolynomialExpansion above
        feature_count = data.first()[0].size
        N = data.count()
        model_onnx = convert_sparkml(model, 'Sparkml PolynomialExpansion', [('dense', FloatTensorType([N, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().expanded.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().dense.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np, expected, model, model_onnx, basename="SparkmlPolynomialExpansion")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['expanded'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #9
Source File: test_decision_tree_classifier.py    From onnxmltools with MIT License
def test_tree_binary_classification(self):
        features = [[0, 1], [1, 1], [2, 0]]
        features = numpy.array(features, dtype=numpy.float32)
        labels = [0, 1, 0]
        dd = [(labels[i], Vectors.dense(features[i])) for i in range(len(labels))]
        data = self.spark.createDataFrame(self.spark.sparkContext.parallelize(dd), schema=["label", "features"])
        dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
        model = dt.fit(data)
        feature_count = 2
        model_onnx = convert_sparkml(model, 'Sparkml Decision Tree Binary Class', [
            ('features', FloatTensorType([1, feature_count]))
        ], spark_session=self.spark)
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        predicted = model.transform(data)
        expected = [
            predicted.toPandas().prediction.values.astype(numpy.float32),
            predicted.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        ]
        paths = save_data_models(data_np, expected, model, model_onnx,
                                basename="SparkmlDecisionTreeBinaryClass")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['prediction', 'probability'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #10
Source File: test_PCA.py    From onnxmltools with MIT License
def test_model_pca(self):
        data = self.spark.createDataFrame([
            (Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
            (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
            (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)
        ], ["features"])
        pca = PCA(k=2, inputCol="features", outputCol="pca_features")
        model = pca.fit(data)

        # the input name should match the inputCol of the PCA estimator above
        feature_count = data.first()[0].size
        N = data.count()
        model_onnx = convert_sparkml(model, 'Sparkml PCA', [('features', FloatTensorType([N, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().pca_features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np, expected, model, model_onnx, basename="SparkmlPCA")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['pca_features'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #11
Source File: feature_engineering.py    From search-MjoLniR with MIT License
def zero_features(df, *feature_names):
    """Zero out features in the feature vector.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    feature_names : list of str

    Returns
    -------
    pyspark.sql.DataFrame
    """
    features = df.schema['features'].metadata['features']
    idxs = [features.index(name) for name in feature_names]

    def zero_features(feat):
        raw = feat.toArray()
        for idx in idxs:
            raw[idx] = 0.
        return Vectors.dense(raw)
    zero_features_udf = F.udf(zero_features, VectorUDT())
    return df.withColumn('features', mjolnir.spark.add_meta(
        df._sc, zero_features_udf('features'), {'features': features})) 
Example #12
Source File: feature_engineering.py    From search-MjoLniR with MIT License
def append_features(df, *cols):
    """Append features from columns to the features vector.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    cols : list of str

    Returns
    -------
    pyspark.sql.DataFrame
    """
    def add_features(feat, *other):
        raw = feat.toArray()
        return Vectors.dense(np.append(raw, list(map(float, other))))
    add_features_udf = F.udf(add_features, VectorUDT())
    new_feat_list = df.schema['features'].metadata['features'] + cols
    return df.withColumn('features', mjolnir.spark.add_meta(
        df._sc, add_features_udf('features', *cols), {'features': new_feat_list})) 
Example #13
Source File: test_gbt_regressor.py    From onnxmltools with MIT License
def test_gbt_regressor(self):
        data = self.spark.createDataFrame([
            (1.0, Vectors.dense(1.0)),
            (0.0, Vectors.sparse(1, [], []))
        ], ["label", "features"])
        gbt = GBTRegressor(maxIter=5, maxDepth=2, seed=42)
        model = gbt.fit(data)
        feature_count = data.first()[1].size
        model_onnx = convert_sparkml(model, 'Sparkml GBTRegressor', [
            ('features', FloatTensorType([1, feature_count]))
        ], spark_session=self.spark)
        self.assertTrue(model_onnx is not None)
        # run the model
        predicted = model.transform(data)
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        expected = [
            predicted.toPandas().prediction.values.astype(numpy.float32),
        ]
        paths = save_data_models(data_np, expected, model, model_onnx,
                                    basename="SparkmlGBTRegressor")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['prediction'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #14
Source File: test_normalizer.py    From onnxmltools with MIT License
def test_model_normalizer_1(self):
        data = self.spark.createDataFrame([
          (0, Vectors.dense(1.0, 0.5, -1.0)),
          (1, Vectors.dense(2.0, 1.0, 1.0)),
          (2, Vectors.dense(4.0, 10.0, 2.0))
        ]).toDF("id", "features")
        model = Normalizer(inputCol='features', outputCol='norm_feature', p=1.0)

        model_onnx = convert_sparkml(model, 'Sparkml Normalizer', [('features', FloatTensorType([1, 3]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().norm_feature.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np, expected, model, model_onnx, basename="SparkmlNormalizer")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['norm_feature'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #15
Source File: test_decision_tree_classifier.py    From onnxmltools with MIT License
def test_tree_multiple_classification(self):
        features = [[0, 1], [1, 1], [2, 0], [0.5, 0.5], [1.1, 1.1], [2.1, 0.1]]
        features = numpy.array(features, dtype=numpy.float32)
        labels = [0, 1, 2, 1, 1, 2]
        dd = [(labels[i], Vectors.dense(features[i])) for i in range(len(labels))]
        data = self.spark.createDataFrame(self.spark.sparkContext.parallelize(dd), schema=["label", "features"])
        dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
        model = dt.fit(data)
        feature_count = 2
        model_onnx = convert_sparkml(model, 'Sparkml Decision Tree Multi Class', [
            ('features', FloatTensorType([1, feature_count]))
        ], spark_session=self.spark)
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        predicted = model.transform(data)
        expected = [
            predicted.toPandas().prediction.values.astype(numpy.float32),
            predicted.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        ]
        paths = save_data_models(data_np, expected, model, model_onnx,
                                basename="SparkmlDecisionTreeMultiClass")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['prediction', 'probability'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #16
Source File: tests.py    From LearningApacheSpark with MIT License
def test_save_load_trained_model(self):
        # This tests saving and loading the trained model only.
        # Save/load for TrainValidationSplit will be added later: SPARK-13786
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        tvsModel = tvs.fit(dataset)
        lrModel = tvsModel.bestModel

        tvsModelPath = temp_path + "/tvsModel"
        lrModel.save(tvsModelPath)
        loadedLrModel = LogisticRegressionModel.load(tvsModelPath)
        self.assertEqual(loadedLrModel.uid, lrModel.uid)
        self.assertEqual(loadedLrModel.intercept, lrModel.intercept) 
Example #17
Source File: test_decision_tree_regressor.py    From onnxmltools with MIT License
def test_decision_tree_regressor(self):
        features = [[0, 1], [1, 1], [2, 0]]
        features = numpy.array(features, dtype=numpy.float32)
        labels = [100, -10, 50]
        dd = [(labels[i], Vectors.dense(features[i])) for i in range(len(labels))]
        data = self.spark.createDataFrame(self.spark.sparkContext.parallelize(dd), schema=["label", "features"])
        dt = DecisionTreeRegressor(labelCol="label", featuresCol="features")
        model = dt.fit(data)
        feature_count = data.select('features').first()[0].size
        model_onnx = convert_sparkml(model, 'Sparkml Decision Tree Regressor', [
            ('features', FloatTensorType([1, feature_count]))
        ], spark_session=self.spark)
        self.assertTrue(model_onnx is not None)
        # run the model
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        predicted = model.transform(data)
        expected = [
            predicted.toPandas().prediction.values.astype(numpy.float32)
        ]
        paths = save_data_models(data_np, expected, model, model_onnx,
                                    basename="SparkmlDecisionTreeRegressor")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['prediction'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #18
Source File: test_vector_indexer.py    From onnxmltools with MIT License
def test_model_vector_indexer_single(self):
        vi = VectorIndexer(maxCategories=3, inputCol="a", outputCol="indexed")
        data = self.spark.createDataFrame([
            (Vectors.dense([-1.0]),),
            (Vectors.dense([0.0]),),
            (Vectors.dense([0.0]),)],
            ["a"]
        )
        model = vi.fit(data)
        model_onnx = convert_sparkml(model, 'Sparkml VectorIndexer Single',  [
            ('a', FloatTensorType([1, model.numFeatures]))
        ], target_opset=9)
        self.assertTrue(model_onnx is not None)
        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().indexed.apply(lambda x: pandas.Series(x.toArray())).values
        data_np = data.toPandas().a.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np, expected, model, model_onnx,
                                    basename="SparkmlVectorIndexerSingle")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['indexed'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #19
Source File: test_naive_bayes.py    From onnxmltools with MIT License
def test_naive_bayes_multinomial(self):
        data = self.spark.createDataFrame([
            Row(label=0.0, weight=0.1, features=Vectors.dense([0.0, 0.0])),
            Row(label=0.0, weight=0.5, features=Vectors.dense([0.0, 1.0])),
            Row(label=1.0, weight=1.0, features=Vectors.dense([1.0, 0.0]))])
        nb = NaiveBayes(smoothing=1.0, modelType="multinomial", weightCol="weight")
        model = nb.fit(data)
        feature_count = data.select('features').first()[0].size
        model_onnx = convert_sparkml(model, 'Sparkml NaiveBayes Multinomial',
                                     [('features', FloatTensorType([1, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = [
            predicted.toPandas().prediction.values.astype(numpy.float32),
            predicted.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
            ]
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np, expected, model, model_onnx, basename="SparkmlNaiveBayesMultinomial")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['prediction', 'probability'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #20
Source File: tests.py    From LearningApacheSpark with MIT License
def test_parallel_evaluation(self):
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [5, 6]).build()
        evaluator = BinaryClassificationEvaluator()
        tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        tvs.setParallelism(1)
        tvsSerialModel = tvs.fit(dataset)
        tvs.setParallelism(2)
        tvsParallelModel = tvs.fit(dataset)
        self.assertEqual(tvsSerialModel.validationMetrics, tvsParallelModel.validationMetrics) 
Example #21
Source File: test_naive_bayes.py    From onnxmltools with MIT License
def test_naive_bayes_bernoulli(self):
        data = self.spark.createDataFrame([
            Row(label=0.0, weight=0.1, features=Vectors.dense([0.0, 0.0])),
            Row(label=0.0, weight=0.5, features=Vectors.dense([0.0, 1.0])),
            Row(label=1.0, weight=1.0, features=Vectors.dense([1.0, 0.0]))])
        nb = NaiveBayes(smoothing=1.0, modelType="bernoulli", weightCol="weight")
        model = nb.fit(data)
        feature_count = data.select('features').first()[0].size
        model_onnx = convert_sparkml(model, 'Sparkml NaiveBayes Bernoulli',
                                     [('features', FloatTensorType([1, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = [
            predicted.toPandas().prediction.values.astype(numpy.float32),
            predicted.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
            ]
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np, expected, model, model_onnx, basename="SparkmlNaiveBayesBernoulli")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['prediction', 'probability'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #22
Source File: tests.py    From LearningApacheSpark with MIT License
def test_linear_regression_pmml_basic(self):
        # Most of the validation is done in the Scala side, here we just check
        # that we output text rather than parquet (e.g. that the format flag
        # was respected).
        df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                         (0.0, 2.0, Vectors.sparse(1, [], []))],
                                        ["label", "weight", "features"])
        lr = LinearRegression(maxIter=1)
        model = lr.fit(df)
        path = tempfile.mkdtemp()
        lr_path = path + "/lr-pmml"
        model.write().format("pmml").save(lr_path)
        pmml_text_list = self.sc.textFile(lr_path).collect()
        pmml_text = "\n".join(pmml_text_list)
        self.assertIn("Apache Spark", pmml_text)
        self.assertIn("PMML", pmml_text) 
Example #23
Source File: tests.py    From LearningApacheSpark with MIT License
def test_onevsrest(self):
        temp_path = tempfile.mkdtemp()
        df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                         (1.0, Vectors.sparse(2, [], [])),
                                         (2.0, Vectors.dense(0.5, 0.5))] * 10,
                                        ["label", "features"])
        lr = LogisticRegression(maxIter=5, regParam=0.01)
        ovr = OneVsRest(classifier=lr)
        model = ovr.fit(df)
        ovrPath = temp_path + "/ovr"
        ovr.save(ovrPath)
        loadedOvr = OneVsRest.load(ovrPath)
        self._compare_pipelines(ovr, loadedOvr)
        modelPath = temp_path + "/ovrModel"
        model.save(modelPath)
        loadedModel = OneVsRestModel.load(modelPath)
        self._compare_pipelines(model, loadedModel) 
Example #24
Source File: test_linear_regressor.py    From onnxmltools with MIT License
def test_model_linear_regression_basic(self):
        data = self.spark.createDataFrame([
            (1.0, 2.0, Vectors.dense(1.0)),
            (0.0, 2.0, Vectors.sparse(1, [], []))
        ], ["label", "weight", "features"])
        lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight")
        model = lr.fit(data)
        # the name of the input is 'features'
        C = model.numFeatures
        model_onnx = convert_sparkml(model, 'sparkml LinearRegressorBasic', [('features', FloatTensorType([1, C]))])
        self.assertTrue(model_onnx is not None)
        # run the model
        import pandas
        predicted = model.transform(data)
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        expected = [ predicted.toPandas().prediction.values.astype(numpy.float32) ]
        paths = save_data_models(data_np, expected, model, model_onnx,
                                    basename="SparkmlLinearRegressor_Basic")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['prediction'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #25
Source File: test_vector_slicer.py    From onnxmltools with MIT License
def test_vector_slicer(self):
        data = self.spark.createDataFrame([
            (Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]), ),
            (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]), ),
            (Vectors.dense([0.6, -1.1, -3.0, 4.5, 3.3]), )], ["features"])
        model = VectorSlicer(inputCol="features", outputCol="sliced", indices=[1, 4])

        feature_count = data.first()[0].array.size
        model_onnx = convert_sparkml(model, 'Sparkml VectorSlicer',
                                     [('features', FloatTensorType([1, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().sliced.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np, expected, model, model_onnx, basename="SparkmlVectorSlicer")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['sliced'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #26
Source File: test_chi_sql_selector.py    From onnxmltools with MIT License
def test_chi_sq_selector(self):
        data = self.spark.createDataFrame([
            (Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),
            (Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),
            (Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0)
        ], ["features", "label"])
        selector = ChiSqSelector(numTopFeatures=1, outputCol="selectedFeatures")
        model = selector.fit(data)
        print(model.selectedFeatures)

        # the input name should match the inputCol of the ChiSqSelector above
        feature_count = data.first()[0].size
        N = data.count()
        model_onnx = convert_sparkml(model, 'Sparkml ChiSqSelector', [('features', FloatTensorType([N, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().selectedFeatures.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np, expected, model, model_onnx, basename="SparkmlChiSqSelector")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['selectedFeatures'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #27
Source File: tests.py    From LearningApacheSpark with MIT License
def test_gaussian_mixture_summary(self):
        data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
                (Vectors.sparse(1, [], []),)]
        df = self.spark.createDataFrame(data, ["features"])
        gmm = GaussianMixture(k=2)
        model = gmm.fit(df)
        self.assertTrue(model.hasSummary)
        s = model.summary
        self.assertTrue(isinstance(s.predictions, DataFrame))
        self.assertEqual(s.probabilityCol, "probability")
        self.assertTrue(isinstance(s.probability, DataFrame))
        self.assertEqual(s.featuresCol, "features")
        self.assertEqual(s.predictionCol, "prediction")
        self.assertTrue(isinstance(s.cluster, DataFrame))
        self.assertEqual(len(s.clusterSizes), 2)
        self.assertEqual(s.k, 2)
        self.assertEqual(s.numIter, 3) 
Example #28
Source File: converter.py    From spark-sklearn with Apache License 2.0
def _toSparkGLM(self, model):
        """ Private method for converting a GLM to a Spark model
        TODO: Add model parameters as well.
        """
        skl_cls = type(model)
        py_cls = self._skl2spark_classes[skl_cls].py
        jvm_cls_name = self._skl2spark_classes[skl_cls].jvm
        intercept = model.intercept_
        weights = model.coef_
        if len(np.shape(weights)) == 1\
                or (len(np.shape(weights)) == 2 and np.shape(weights)[0] == 1):
            # Binary classification
            uid = _randomUID(skl_cls)
            _java_model = _new_java_obj(self.sc, jvm_cls_name, uid, Vectors.dense(weights), float(intercept))
            return py_cls(_java_model)
        elif len(np.shape(weights)) == 2 and skl_cls == SKL_LogisticRegression:
            # Multiclass label
            raise ValueError("Converter.toSpark cannot convert a multiclass sklearn Logistic" +
                             " Regression model to Spark because Spark does not yet support" +
                             " multiclass.  Given model is for %d classes." %
                             np.shape(weights)[0])
        else:
            raise Exception("Converter.toSpark experienced unknown error when trying to convert" +
                            " a model of type: " + type(model) + "  " + len(np.shape(weights))) 
Example #29
Source File: tests.py    From LearningApacheSpark with MIT License
def test_raw_and_probability_prediction(self):

        data_path = "data/mllib/sample_multiclass_classification_data.txt"
        df = self.spark.read.format("libsvm").load(data_path)

        mlp = MultilayerPerceptronClassifier(maxIter=100, layers=[4, 5, 4, 3],
                                             blockSize=128, seed=123)
        model = mlp.fit(df)
        test = self.sc.parallelize([Row(features=Vectors.dense(0.1, 0.1, 0.25, 0.25))]).toDF()
        result = model.transform(test).head()
        expected_prediction = 2.0
        expected_probability = [0.0, 0.0, 1.0]
        expected_rawPrediction = [57.3955, -124.5462, 67.9943]
        self.assertEqual(result.prediction, expected_prediction)
        self.assertTrue(np.allclose(result.probability, expected_probability, atol=1E-4))
        self.assertTrue(np.allclose(result.rawPrediction, expected_rawPrediction, atol=1E-4)) 
Example #30
Source File: tests.py    From LearningApacheSpark with MIT License
def test_multinomial_logistic_regression_with_bound(self):

        data_path = "data/mllib/sample_multiclass_classification_data.txt"
        df = self.spark.read.format("libsvm").load(data_path)

        lor = LogisticRegression(regParam=0.01,
                                 lowerBoundsOnCoefficients=Matrices.dense(3, 4, range(12)),
                                 upperBoundsOnIntercepts=Vectors.dense(0.0, 0.0, 0.0))
        model = lor.fit(df)
        expected = [[4.593, 4.5516, 9.0099, 12.2904],
                    [1.0, 8.1093, 7.0, 10.0],
                    [3.041, 5.0, 8.0, 11.0]]
        for i in range(0, len(expected)):
            self.assertTrue(
                np.allclose(model.coefficientMatrix.toArray()[i], expected[i], atol=1E-4))
        self.assertTrue(
            np.allclose(model.interceptVector.toArray(), [-0.9057, -1.1392, -0.0033], atol=1E-4))