Python pyspark.ml.linalg.Vectors.sparse() Examples

The following are 21 code examples of pyspark.ml.linalg.Vectors.sparse(), collected from open-source projects. The originating project and source file are noted above each example. You may also want to check out all available functions/classes of the module pyspark.ml.linalg.Vectors.
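All of the examples below construct sparse vectors using one of three equivalent calling conventions: parallel index/value lists, a dict mapping index to value, or a list of (index, value) pairs. Here is a minimal sketch (variable names are illustrative; it needs only pyspark installed, no running SparkSession):

from pyspark.ml.linalg import Vectors

# All three calls build the same 5-element vector with non-zeros at indices 1 and 3.
sv1 = Vectors.sparse(5, [1, 3], [1.0, 7.0])    # parallel index/value lists
sv2 = Vectors.sparse(5, {1: 1.0, 3: 7.0})      # dict mapping index -> value
sv3 = Vectors.sparse(5, [(1, 1.0), (3, 7.0)])  # list of (index, value) pairs
assert sv1 == sv2 == sv3

# Vectors.sparse(size, [], []) is an all-zero vector; several tests below use it
# as a zero-feature row alongside Vectors.dense(...).
zero = Vectors.sparse(1, [], [])
assert zero.toArray().tolist() == [0.0]
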
Example #1
Source File: test_gbt_regressor.py    From onnxmltools with MIT License
def test_gbt_regressor(self):
        data = self.spark.createDataFrame([
            (1.0, Vectors.dense(1.0)),
            (0.0, Vectors.sparse(1, [], []))
        ], ["label", "features"])
        gbt = GBTRegressor(maxIter=5, maxDepth=2, seed=42)
        model = gbt.fit(data)
        feature_count = data.first()[1].size
        model_onnx = convert_sparkml(model, 'Sparkml GBTRegressor', [
            ('features', FloatTensorType([1, feature_count]))
        ], spark_session=self.spark)
        self.assertTrue(model_onnx is not None)
        # run the model
        predicted = model.transform(data)
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        expected = [
            predicted.toPandas().prediction.values.astype(numpy.float32),
        ]
        paths = save_data_models(data_np, expected, model, model_onnx,
                                 basename="SparkmlGBTRegressor")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['prediction'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #2
Source File: tests.py    From LearningApacheSpark with MIT License
def test_linear_regression_pmml_basic(self):
        # Most of the validation is done in the Scala side, here we just check
        # that we output text rather than parquet (e.g. that the format flag
        # was respected).
        df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                         (0.0, 2.0, Vectors.sparse(1, [], []))],
                                        ["label", "weight", "features"])
        lr = LinearRegression(maxIter=1)
        model = lr.fit(df)
        path = tempfile.mkdtemp()
        lr_path = path + "/lr-pmml"
        model.write().format("pmml").save(lr_path)
        pmml_text_list = self.sc.textFile(lr_path).collect()
        pmml_text = "\n".join(pmml_text_list)
        self.assertIn("Apache Spark", pmml_text)
        self.assertIn("PMML", pmml_text) 
Example #3
Source File: tests.py    From LearningApacheSpark with MIT License
def test_onevsrest(self):
        temp_path = tempfile.mkdtemp()
        df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                         (1.0, Vectors.sparse(2, [], [])),
                                         (2.0, Vectors.dense(0.5, 0.5))] * 10,
                                        ["label", "features"])
        lr = LogisticRegression(maxIter=5, regParam=0.01)
        ovr = OneVsRest(classifier=lr)
        model = ovr.fit(df)
        ovrPath = temp_path + "/ovr"
        ovr.save(ovrPath)
        loadedOvr = OneVsRest.load(ovrPath)
        self._compare_pipelines(ovr, loadedOvr)
        modelPath = temp_path + "/ovrModel"
        model.save(modelPath)
        loadedModel = OneVsRestModel.load(modelPath)
        self._compare_pipelines(model, loadedModel) 
Example #4
Source File: test_PCA.py    From onnxmltools with MIT License
def test_model_pca(self):
        data = self.spark.createDataFrame([
            (Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
            (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
            (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)
        ], ["features"])
        pca = PCA(k=2, inputCol="features", outputCol="pca_features")
        model = pca.fit(data)

        # the input name should match the estimator's inputCol ('features')
        feature_count = data.first()[0].size
        N = data.count()
        model_onnx = convert_sparkml(model, 'Sparkml PCA', [('features', FloatTensorType([N, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().pca_features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np, expected, model, model_onnx, basename="SparkmlPCA")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['pca_features'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #5
Source File: test_aft_survival_regression.py    From onnxmltools with MIT License
def test_aft_regression_survival(self):
        data = self.spark.createDataFrame([
            (1.0, Vectors.dense(1.0), 1.0),
            (1e-40, Vectors.sparse(1, [], []), 0.0)
        ], ["label", "features", "censor"])
        aft = AFTSurvivalRegression()
        model = aft.fit(data)
        feature_count = data.first()[1].size
        model_onnx = convert_sparkml(model, 'Sparkml AFTSurvivalRegression', [
            ('features', FloatTensorType([1, feature_count]))
        ], spark_session=self.spark)
        self.assertTrue(model_onnx is not None)
        # run the model
        predicted = model.transform(data)
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        expected = [
            predicted.toPandas().prediction.values.astype(numpy.float32),
        ]
        paths = save_data_models(data_np, expected, model, model_onnx,
                                 basename="SparkmlAFTSurvivalRegression")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['prediction'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #6
Source File: test_linear_regressor.py    From onnxmltools with MIT License
def test_model_linear_regression_basic(self):
        data = self.spark.createDataFrame([
            (1.0, 2.0, Vectors.dense(1.0)),
            (0.0, 2.0, Vectors.sparse(1, [], []))
        ], ["label", "weight", "features"])
        lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight")
        model = lr.fit(data)
        # the name of the input is 'features'
        C = model.numFeatures
        model_onnx = convert_sparkml(model, 'sparkml LinearRegressorBasic', [('features', FloatTensorType([1, C]))])
        self.assertTrue(model_onnx is not None)
        # run the model
        import pandas
        predicted = model.transform(data)
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        expected = [ predicted.toPandas().prediction.values.astype(numpy.float32) ]
        paths = save_data_models(data_np, expected, model, model_onnx,
                                 basename="SparkmlLinearRegressor_Basic")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['prediction'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #7
Source File: tests.py    From LearningApacheSpark with MIT License
def test_gaussian_mixture_summary(self):
        data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
                (Vectors.sparse(1, [], []),)]
        df = self.spark.createDataFrame(data, ["features"])
        gmm = GaussianMixture(k=2)
        model = gmm.fit(df)
        self.assertTrue(model.hasSummary)
        s = model.summary
        self.assertTrue(isinstance(s.predictions, DataFrame))
        self.assertEqual(s.probabilityCol, "probability")
        self.assertTrue(isinstance(s.probability, DataFrame))
        self.assertEqual(s.featuresCol, "features")
        self.assertEqual(s.predictionCol, "prediction")
        self.assertTrue(isinstance(s.cluster, DataFrame))
        self.assertEqual(len(s.clusterSizes), 2)
        self.assertEqual(s.k, 2)
        self.assertEqual(s.numIter, 3) 
Example #8
Source File: dl_runner.py    From sparkflow with MIT License
def test_small_sparse(self):
        xor = [(0.0, Vectors.sparse(2, [0, 1], [0.0, 0.0])),
               (0.0, Vectors.sparse(2, [0, 1], [1.0, 1.0])),
               (1.0, Vectors.sparse(2, [0], [1.0])),
               (1.0, Vectors.sparse(2, [1], [1.0]))]
        processed = self.spark.createDataFrame(xor, ["label", "features"])

        mg = build_graph(SparkFlowTests.create_model)
        spark_model = SparkAsyncDL(
            inputCol='features',
            tensorflowGraph=mg,
            tfInput='x:0',
            tfLabel='y:0',
            tfOutput='outer/Sigmoid:0',
            tfOptimizer='adam',
            tfLearningRate=.1,
            iters=35,
            partitions=2,
            predictionCol='predicted',
            labelCol='label'
        )
        assert spark_model.fit(processed).transform(processed).collect() is not None 
Example #9
Source File: tests.py    From LearningApacheSpark with MIT License
def test_copy(self):
        df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                         (1.0, Vectors.sparse(2, [], [])),
                                         (2.0, Vectors.dense(0.5, 0.5))],
                                        ["label", "features"])
        lr = LogisticRegression(maxIter=5, regParam=0.01)
        ovr = OneVsRest(classifier=lr)
        ovr1 = ovr.copy({lr.maxIter: 10})
        self.assertEqual(ovr.getClassifier().getMaxIter(), 5)
        self.assertEqual(ovr1.getClassifier().getMaxIter(), 10)
        model = ovr.fit(df)
        model1 = model.copy({model.predictionCol: "indexed"})
        self.assertEqual(model1.getPredictionCol(), "indexed") 
Example #10
Source File: test_min_hash_lsh.py    From onnxmltools with MIT License
def test_min_hash_lsh(self):
        data = self.spark.createDataFrame([
            (0, Vectors.sparse(6, [0, 1, 2], [1.0, 1.0, 1.0]),),
            (1, Vectors.sparse(6, [2, 3, 4], [1.0, 1.0, 1.0]),),
            (2, Vectors.sparse(6, [0, 2, 4], [1.0, 1.0, 1.0]),)
        ], ["id", "features"])
        mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)
        model = mh.fit(data)

        feature_count = data.first()[1].size
        model_onnx = convert_sparkml(model, 'Sparkml MinHashLSH', [
            ('features', FloatTensorType([1, feature_count]))
        ], spark_session=self.spark)
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data.limit(1))
        data_np = data.limit(1).toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        expected = [
            predicted.toPandas().hashes.apply(lambda x: pandas.Series(x)
                                              .map(lambda y: y.values[0])).values.astype(numpy.float32),
        ]
        paths = save_data_models(data_np, expected, model, model_onnx,
                                 basename="SparkmlMinHashLSH")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['hashes'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #11
Source File: converter_test.py    From spark-sklearn with Apache License 2.0
def ztest_toPandas(self):
        data = [(Vectors.dense([0.1, 0.2]),),
                (Vectors.sparse(2, {0: 0.3, 1: 0.4}),),
                (Vectors.sparse(2, {0: 0.5, 1: 0.6}),)]
        df = self.sql.createDataFrame(data, ["features"])
        self.assertEqual(df.count(), 3)
        pd = self.converter.toPandas(df)
        self.assertEqual(len(pd), 3)
        self.assertTrue(isinstance(pd.features[0], csr_matrix),
                        "Expected pd.features[0] to be csr_matrix but found: %s" %
                        type(pd.features[0]))
        self.assertEqual(pd.features[0].shape[0], 3)
        self.assertEqual(pd.features[0].shape[1], 2)
        self.assertEqual(pd.features[0][0, 0], 0.1)
        self.assertEqual(pd.features[0][0, 1], 0.2) 
Example #12
Source File: tests.py    From LearningApacheSpark with MIT License
def test_infer_schema(self):
        rdd = self.sc.parallelize([("dense", self.dm1), ("sparse", self.sm1)])
        df = rdd.toDF()
        schema = df.schema
        self.assertEqual(schema.fields[1].dataType, self.udt)
        matrices = df.rdd.map(lambda x: x._2).collect()
        self.assertEqual(len(matrices), 2)
        for m in matrices:
            if isinstance(m, DenseMatrix):
                self.assertEqual(m, self.dm1)
            elif isinstance(m, SparseMatrix):
                self.assertEqual(m, self.sm1)
            else:
                raise ValueError("Expected a matrix but got type %r" % type(m)) 
Example #13
Source File: tests.py    From LearningApacheSpark with MIT License
def test_support_for_weightCol(self):
        df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8), 1.0),
                                         (1.0, Vectors.sparse(2, [], []), 1.0),
                                         (2.0, Vectors.dense(0.5, 0.5), 1.0)],
                                        ["label", "features", "weight"])
        # classifier inherits hasWeightCol
        lr = LogisticRegression(maxIter=5, regParam=0.01)
        ovr = OneVsRest(classifier=lr, weightCol="weight")
        self.assertIsNotNone(ovr.fit(df))
        # classifier doesn't inherit hasWeightCol
        dt = DecisionTreeClassifier()
        ovr2 = OneVsRest(classifier=dt, weightCol="weight")
        self.assertIsNotNone(ovr2.fit(df)) 
Example #14
Source File: tests.py    From LearningApacheSpark with MIT License
def test_parallelism_doesnt_change_output(self):
        df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                         (1.0, Vectors.sparse(2, [], [])),
                                         (2.0, Vectors.dense(0.5, 0.5))],
                                        ["label", "features"])
        ovrPar1 = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=.01), parallelism=1)
        modelPar1 = ovrPar1.fit(df)
        ovrPar2 = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=.01), parallelism=2)
        modelPar2 = ovrPar2.fit(df)
        for i, model in enumerate(modelPar1.models):
            self.assertTrue(np.allclose(model.coefficients.toArray(),
                                        modelPar2.models[i].coefficients.toArray(), atol=1E-4))
            self.assertTrue(np.allclose(model.intercept, modelPar2.models[i].intercept, atol=1E-4)) 
Example #15
Source File: tests.py    From LearningApacheSpark with MIT License
def test_bisecting_kmeans_summary(self):
        data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
                (Vectors.sparse(1, [], []),)]
        df = self.spark.createDataFrame(data, ["features"])
        bkm = BisectingKMeans(k=2)
        model = bkm.fit(df)
        self.assertTrue(model.hasSummary)
        s = model.summary
        self.assertTrue(isinstance(s.predictions, DataFrame))
        self.assertEqual(s.featuresCol, "features")
        self.assertEqual(s.predictionCol, "prediction")
        self.assertTrue(isinstance(s.cluster, DataFrame))
        self.assertEqual(len(s.clusterSizes), 2)
        self.assertEqual(s.k, 2)
        self.assertEqual(s.numIter, 20) 
Example #16
Source File: tests.py    From LearningApacheSpark with MIT License
def test_multiclass_logistic_regression_summary(self):
        df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                         (0.0, 2.0, Vectors.sparse(1, [], [])),
                                         (2.0, 2.0, Vectors.dense(2.0)),
                                         (2.0, 2.0, Vectors.dense(1.9))],
                                        ["label", "weight", "features"])
        lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
        model = lr.fit(df)
        self.assertTrue(model.hasSummary)
        s = model.summary
        # test that api is callable and returns expected types
        self.assertTrue(isinstance(s.predictions, DataFrame))
        self.assertEqual(s.probabilityCol, "probability")
        self.assertEqual(s.labelCol, "label")
        self.assertEqual(s.featuresCol, "features")
        self.assertEqual(s.predictionCol, "prediction")
        objHist = s.objectiveHistory
        self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
        self.assertGreater(s.totalIterations, 0)
        self.assertTrue(isinstance(s.labels, list))
        self.assertTrue(isinstance(s.truePositiveRateByLabel, list))
        self.assertTrue(isinstance(s.falsePositiveRateByLabel, list))
        self.assertTrue(isinstance(s.precisionByLabel, list))
        self.assertTrue(isinstance(s.recallByLabel, list))
        self.assertTrue(isinstance(s.fMeasureByLabel(), list))
        self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list))
        self.assertAlmostEqual(s.accuracy, 0.75, 2)
        self.assertAlmostEqual(s.weightedTruePositiveRate, 0.75, 2)
        self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.25, 2)
        self.assertAlmostEqual(s.weightedRecall, 0.75, 2)
        self.assertAlmostEqual(s.weightedPrecision, 0.583, 2)
        self.assertAlmostEqual(s.weightedFMeasure(), 0.65, 2)
        self.assertAlmostEqual(s.weightedFMeasure(1.0), 0.65, 2)
        # test evaluation (with training dataset) produces a summary with same values
        # one check is enough to verify a summary is returned, Scala version runs full test
        sameSummary = model.evaluate(df)
        self.assertAlmostEqual(sameSummary.accuracy, s.accuracy) 
Example #17
Source File: tests.py    From LearningApacheSpark with MIT License
def test_binary_logistic_regression_summary(self):
        df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                         (0.0, 2.0, Vectors.sparse(1, [], []))],
                                        ["label", "weight", "features"])
        lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
        model = lr.fit(df)
        self.assertTrue(model.hasSummary)
        s = model.summary
        # test that api is callable and returns expected types
        self.assertTrue(isinstance(s.predictions, DataFrame))
        self.assertEqual(s.probabilityCol, "probability")
        self.assertEqual(s.labelCol, "label")
        self.assertEqual(s.featuresCol, "features")
        self.assertEqual(s.predictionCol, "prediction")
        objHist = s.objectiveHistory
        self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
        self.assertGreater(s.totalIterations, 0)
        self.assertTrue(isinstance(s.labels, list))
        self.assertTrue(isinstance(s.truePositiveRateByLabel, list))
        self.assertTrue(isinstance(s.falsePositiveRateByLabel, list))
        self.assertTrue(isinstance(s.precisionByLabel, list))
        self.assertTrue(isinstance(s.recallByLabel, list))
        self.assertTrue(isinstance(s.fMeasureByLabel(), list))
        self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list))
        self.assertTrue(isinstance(s.roc, DataFrame))
        self.assertAlmostEqual(s.areaUnderROC, 1.0, 2)
        self.assertTrue(isinstance(s.pr, DataFrame))
        self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame))
        self.assertTrue(isinstance(s.precisionByThreshold, DataFrame))
        self.assertTrue(isinstance(s.recallByThreshold, DataFrame))
        self.assertAlmostEqual(s.accuracy, 1.0, 2)
        self.assertAlmostEqual(s.weightedTruePositiveRate, 1.0, 2)
        self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.0, 2)
        self.assertAlmostEqual(s.weightedRecall, 1.0, 2)
        self.assertAlmostEqual(s.weightedPrecision, 1.0, 2)
        self.assertAlmostEqual(s.weightedFMeasure(), 1.0, 2)
        self.assertAlmostEqual(s.weightedFMeasure(1.0), 1.0, 2)
        # test evaluation (with training dataset) produces a summary with same values
        # one check is enough to verify a summary is returned, Scala version runs full test
        sameSummary = model.evaluate(df)
        self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC) 
Example #18
Source File: tests.py    From LearningApacheSpark with MIT License
def test_linear_regression_summary(self):
        df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                         (0.0, 2.0, Vectors.sparse(1, [], []))],
                                        ["label", "weight", "features"])
        lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight",
                              fitIntercept=False)
        model = lr.fit(df)
        self.assertTrue(model.hasSummary)
        s = model.summary
        # test that api is callable and returns expected types
        self.assertGreater(s.totalIterations, 0)
        self.assertTrue(isinstance(s.predictions, DataFrame))
        self.assertEqual(s.predictionCol, "prediction")
        self.assertEqual(s.labelCol, "label")
        self.assertEqual(s.featuresCol, "features")
        objHist = s.objectiveHistory
        self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
        self.assertAlmostEqual(s.explainedVariance, 0.25, 2)
        self.assertAlmostEqual(s.meanAbsoluteError, 0.0)
        self.assertAlmostEqual(s.meanSquaredError, 0.0)
        self.assertAlmostEqual(s.rootMeanSquaredError, 0.0)
        self.assertAlmostEqual(s.r2, 1.0, 2)
        self.assertAlmostEqual(s.r2adj, 1.0, 2)
        self.assertTrue(isinstance(s.residuals, DataFrame))
        self.assertEqual(s.numInstances, 2)
        self.assertEqual(s.degreesOfFreedom, 1)
        devResiduals = s.devianceResiduals
        self.assertTrue(isinstance(devResiduals, list) and isinstance(devResiduals[0], float))
        coefStdErr = s.coefficientStandardErrors
        self.assertTrue(isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float))
        tValues = s.tValues
        self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float))
        pValues = s.pValues
        self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float))
        # test evaluation (with training dataset) produces a summary with same values
        # one check is enough to verify a summary is returned
        # The child class LinearRegressionTrainingSummary runs full test
        sameSummary = model.evaluate(df)
        self.assertAlmostEqual(sameSummary.explainedVariance, s.explainedVariance) 
Example #19
Source File: tests.py    From LearningApacheSpark with MIT License
def test_persistence(self):
        # Test save/load for LDA, LocalLDAModel, DistributedLDAModel.
        df = self.spark.createDataFrame([
            [1, Vectors.dense([0.0, 1.0])],
            [2, Vectors.sparse(2, {0: 1.0})],
        ], ["id", "features"])
        # Fit model
        lda = LDA(k=2, seed=1, optimizer="em")
        distributedModel = lda.fit(df)
        self.assertTrue(distributedModel.isDistributed())
        localModel = distributedModel.toLocal()
        self.assertFalse(localModel.isDistributed())
        # Define paths
        path = tempfile.mkdtemp()
        lda_path = path + "/lda"
        dist_model_path = path + "/distLDAModel"
        local_model_path = path + "/localLDAModel"
        # Test LDA
        lda.save(lda_path)
        lda2 = LDA.load(lda_path)
        self._compare(lda, lda2)
        # Test DistributedLDAModel
        distributedModel.save(dist_model_path)
        distributedModel2 = DistributedLDAModel.load(dist_model_path)
        self._compare(distributedModel, distributedModel2)
        # Test LocalLDAModel
        localModel.save(local_model_path)
        localModel2 = LocalLDAModel.load(local_model_path)
        self._compare(localModel, localModel2)
        # Clean up
        try:
            rmtree(path)
        except OSError:
            pass 
Example #20
Source File: tests.py    From LearningApacheSpark with MIT License
def test_java_object_gets_detached(self):
        df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                         (0.0, 2.0, Vectors.sparse(1, [], []))],
                                        ["label", "weight", "features"])
        lr = LinearRegression(maxIter=1, regParam=0.0, solver="normal", weightCol="weight",
                              fitIntercept=False)

        model = lr.fit(df)
        summary = model.summary

        self.assertIsInstance(model, JavaWrapper)
        self.assertIsInstance(summary, JavaWrapper)
        self.assertIsInstance(model, JavaParams)
        self.assertNotIsInstance(summary, JavaParams)

        error_no_object = 'Target Object ID does not exist for this gateway'

        self.assertIn("LinearRegression_", model._java_obj.toString())
        self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())

        model.__del__()

        with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
            model._java_obj.toString()
        self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())

        try:
            summary.__del__()
        except:
            pass

        with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
            model._java_obj.toString()
        with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
            summary._java_obj.toString() 
Example #21
Source File: tests.py    From LearningApacheSpark with MIT License
def test_sparse_matrix(self):
        # Test sparse matrix creation.
        sm1 = SparseMatrix(
            3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0])
        self.assertEqual(sm1.numRows, 3)
        self.assertEqual(sm1.numCols, 4)
        self.assertEqual(sm1.colPtrs.tolist(), [0, 2, 2, 4, 4])
        self.assertEqual(sm1.rowIndices.tolist(), [1, 2, 1, 2])
        self.assertEqual(sm1.values.tolist(), [1.0, 2.0, 4.0, 5.0])
        self.assertEqual(
            repr(sm1),
            'SparseMatrix(3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0], False)')

        # Test indexing
        expected = [
            [0, 0, 0, 0],
            [1, 0, 4, 0],
            [2, 0, 5, 0]]

        for i in range(3):
            for j in range(4):
                self.assertEqual(expected[i][j], sm1[i, j])
        self.assertTrue(array_equal(sm1.toArray(), expected))

        for i, j in [(-1, 1), (4, 3), (3, 5)]:
            self.assertRaises(IndexError, sm1.__getitem__, (i, j))

        # Test conversion to dense and sparse.
        smnew = sm1.toDense().toSparse()
        self.assertEqual(sm1.numRows, smnew.numRows)
        self.assertEqual(sm1.numCols, smnew.numCols)
        self.assertTrue(array_equal(sm1.colPtrs, smnew.colPtrs))
        self.assertTrue(array_equal(sm1.rowIndices, smnew.rowIndices))
        self.assertTrue(array_equal(sm1.values, smnew.values))

        sm1t = SparseMatrix(
            3, 4, [0, 2, 3, 5], [0, 1, 2, 0, 2], [3.0, 2.0, 4.0, 9.0, 8.0],
            isTransposed=True)
        self.assertEqual(sm1t.numRows, 3)
        self.assertEqual(sm1t.numCols, 4)
        self.assertEqual(sm1t.colPtrs.tolist(), [0, 2, 3, 5])
        self.assertEqual(sm1t.rowIndices.tolist(), [0, 1, 2, 0, 2])
        self.assertEqual(sm1t.values.tolist(), [3.0, 2.0, 4.0, 9.0, 8.0])

        expected = [
            [3, 2, 0, 0],
            [0, 0, 4, 0],
            [9, 0, 8, 0]]

        for i in range(3):
            for j in range(4):
                self.assertEqual(expected[i][j], sm1t[i, j])
        self.assertTrue(array_equal(sm1t.toArray(), expected))